#Programming Historian Text Mining YouTube Comment Data with Wordfish in R #set up virtual enviroment and install quanteda install.packages("renv") renv::init() renv::install("quanteda.textmodels@0.9.6") #install packages install.packages(c("tidyverse", "quanteda", "quanteda.textplots")) #load packages library(tidyverse); library(lubridate); library(ggplot2); library(purrr), library(stringr) #load comments files downloaded from YouTube Data Tools and add videoId column from file name comment_files <- list.files(path = "ytdt_data/", recursive = TRUE, pattern = "\\comments.csv$", full.names = TRUE) comment_files all_comments <- read_csv(comment_files, id = "videoId", col_select = c( commentId = id, authorName, commentText = text), show_col_types = FALSE) %>% suppressWarnings() all_comments$videoId <- str_extract( all_comments$videoId, "(?<=ytdt_data\\/).+(?=\\/videoinfo)" ) all_comments #load in video files downloaded from YouTube Data Tools containing video metadata video_files <- list.files(path = "ytdt_data/", recursive = TRUE, pattern = "basicinfo\\.csv$", full.names = TRUE) video_files #pivoting, so data organized by row rather than column all_videos <- read_csv(video_files, col_names = FALSE, id = "videoId", show_col_types = FALSE) %>% mutate(videoId = str_extract(videoId, "(?<=ytdt_data\\/).+(?=\\/videoinfo)")) %>% pivot_wider(names_from = X1, values_from = X2) %>% select(videoId, videoChannelTitle = channelTitle, videoTitle = title, commentCount) #join video and comment data all_data <- inner_join(all_comments, all_videos) count(all_data, sort(videoChannelTitle)) #add partisan indicator as factor for later visualization #NOTE - users should update this according to their own datasets and classifications all_data$partisan <- all_data$videoChannelTitle all_data <- all_data |> mutate(partisan = as.factor(case_when( partisan %in% c("Ben Shapiro", "New York Post", "Fox News", "DailyWire+") ~ "right", partisan == "NBC News" ~ "left", TRUE ~ partisan)) ) glimpse(all_data) #calling this library and related packages when needed and later than other packages to avoid conflicts library(quanteda) #clean the Data #create custom stopword list, including both custom and quanteda stopwords #remove stopwords, convert to lowercase, and remove punctuation and html my_stopwords <- c(stopwords("en"), "brostein", "derrick", "camry") all_data$text <- all_data$commentText %>% str_to_lower() %>% str_remove_all(str_c("\\b", my_stopwords, "\\b", collapse = "|")) all_data$text <- all_data$text %>% str_remove_all("[:punct:]||'|[$]") %>% str_remove_all("[@][\\w_-]+|[#][\\w_-]+|http\\S+\\s*||[:digit:]*|") #remove any duplicate rows all_data <- all_data %>% unique() print(paste(nrow(all_data), "comments remaining")) #create new column with duplicate words removed from each comment all_data$uniqueWords <- sapply(str_split(all_data$text, " "), function(x) paste(unique(x), collapse = " ")) #remove any comments with less than 10 words all_data <- all_data %>% mutate( numbWords = str_count(all_data$uniqueWords, boundary("word"))) %>% filter( numbWords >= 10) print(paste(nrow(all_data), "comments remaining")) #export cleaned YouTube comment data write.csv(all_data, "all_data.csv") #selecting Comments for the Corpus wfAll <- select(all_data, commentId, uniqueWords, videoChannelTitle, partisan, numbWords) #building the Corpus options(width = 110) corp_all <- corpus(wfAll, docid_field = "commentId", text_field = "uniqueWords") summary(docvars(corp_all)) #tokenization and DFM Creation toks_all <- tokens(corp_all, remove_punct = TRUE, remove_symbols = TRUE, remove_numbers = TRUE, remove_url = TRUE, remove_separators = TRUE) #optimizing the Corpus for Wordfish dfmat_all <- dfm(toks_all) print(paste("you created", "a dfm with", ndoc(dfmat_all), "documents and", nfeat(dfmat_all), "features")) dfmat_all <- dfm_keep(dfmat_all, min_nchar = 4) dfmat_all <- dfm_trim(dfmat_all, min_docfreq = 0.01, min_termfreq = 0.0001, termfreq_type = "prop") print(dfmat_all) #list of most frequent 25 words topWords <- topfeatures(dfmat_all, 25, decreasing = TRUE) %>% names() %>% sort() topWords library(quanteda.textmodels) #run the Wordfish model tmod_wf_all <- textmodel_wordfish(dfmat_all, dispersion = "poisson", sparse = TRUE, residual_floor = 0.5, dir=c(2,1)) summary(tmod_wf_all) library(quanteda.textplots) #plot all features wf_feature_plot <- textplot_scale1d(tmod_wf_all, margin = "features") + labs(title = "Wordfish Model Visualization - Feature Scaling", x = "Estimated beta", y= "Estimated psi") wf_feature_plot #remove any additional stopwords more_stopwords <- c("edward", "bombed", "calmly") # Removing these to focus on main visualization (remove tails) dfmat_all <- dfm_remove(dfmat_all, pattern = more_stopwords) #run the Wordfish model after removing additional stopwords tmod_wf_all <- textmodel_wordfish(dfmat_all, dispersion = "poisson", sparse = TRUE, residual_floor = 0.5, dir=c(2,1)) summary(tmod_wf_all) #plot all features wf_feature_plot_more_stopwords <- textplot_scale1d(tmod_wf_all, margin = "features") + labs(title = "Wordfish Model Visualization - Feature Scaling", x = "Estimated beta", y= "Estimated psi") wf_feature_plot_more_stopwords #save the final version of the feature scaling visualization ggsave("Wordfish Model Visualization - Feature Scaling.jpg", plot=wf_feature_plot_more_stopwords) #create data object for comment plot wf_comment_df <- tibble( theta = tmod_wf_all[["theta"]], alpha = tmod_wf_all[["alpha"]], partisan = as.factor(tmod_wf_all[["x"]]@docvars$partisan) ) #visualize the comment data wf_comment_plot <- ggplot(wf_comment_df) + geom_point(aes(x = theta, y = alpha, color = partisan), shape = 1) + scale_color_manual(values = c("blue", "red")) + labs(title = "Wordfish Model Visualization - Comment Scaling", x = "Estimated theta", y= "Estimated alpha") wf_comment_plot