library(readr) library(tidyverse) library(Rtsne) library(ggplot2) library(plotly) library(lubridate) library(stringr) library(gtable) library(classInt) library(scales) library(gridExtra) library(forcats) library(patchwork) set.seed(2000) dat <- read_csv("S2_Data.csv") first_topic <- dat[,c(1, 7:31)] %>% gather(col, val, -message_id) %>% group_by(message_id) %>% arrange(val) %>% top_n(1) %>% do(head(., 1)) %>% rename( first = col, first_prob = val ) theme.labs <- c("Scientific and Medical Basis of COVID-19", "COVID-19 Vaccine", "COVID-19 Mitigation Strategies", "Society and Institutions", "Family and Personal Relationships", "Navigating the COVID-19 Infodemic") topic.labs <- c("Immunology", "Disease Process", "Immunity and Transmission", "Viral Variants", "Other Medical Conditions", "Dosage and Timing", "Development and Rollout", "Behavior Surr. Vaccination", "Safety and Side Effects", "Socializing Safely", "Testing and Isolation", "Ventilation", "Sanitation and Hygiene", "Masks", "Education", "Healthcare and Policy", "Social Participation", "Interacting with Family", "Children and Parenting", "Personal Relationships", "Verifying Information", "Feedback", "Data and Statistics", "Sense-making", "Risk Assessment") names(topic.labs) <- c("a_immunology", "b_disease_process", "c_immunity_transmission", "d_viral_variant", "e_other_conditions", "f_dosage_timing", "g_development_rollout", "h_behavior_vaccination", "i_safety_sideeffects", "j_socializing_safely", "j_testing_isolation", "k_ventilation", "l_sanitation_hygiene", "m_masking", "n_education", "o_healthcare_policy", "p_travel", "q_interacting_family", "r_children_parenting", "s_personal", "t_verifying_info", "u_dear_pandemic", "v_data_statistics", "w_sensemaking", "x_risk_assessment") fake.labs <- c(expression(bold("Scientific and Medical Basis of COVID-19")),"Immunology", "Disease Process", "Immunity and Transmission", "Viral Variants", "Other Medical Conditions", expression(bold("COVID-19 Vaccine")),"Dosage and Timing", "Development and Rollout", "Behavior Surr. Vaccination", "Safety and Side Effects", expression(bold("COVID-19 Mitigation Strategies")),"Socializing Safely", "Testing and Isolation", "Ventilation", "Sanitation and Hygiene", "Masks", expression(bold("Society and Institutions")),"Education", "Healthcare and Policy", "Social Participation", expression(bold("Family and Personal Relationships")),"Interacting with Family", "Children and Parenting", "Personal Relationships", expression(bold("Navigating the COVID-19 Infodemic")),"Verifying Information", "Feedback", "Data and Statistics", "Sense-making", "Risk Assessment") names(fake.labs) <- c("0_theme1", "a_immunology", "b_disease_process", "c_immunity_transmission", "d_viral_variant", "e_other_conditions", "ea_theme2", "f_dosage_timing", "g_development_rollout", "h_behavior_vaccination", "i_safety_sideeffects", "ia_theme3", "j_socializing_safely", "j_testing_isolation", "k_ventilation", "l_sanitation_hygiene", "m_masking", "ma_theme4", "n_education", "o_healthcare_policy", "p_travel", "pa_theme5", "q_interacting_family", "r_children_parenting", "s_personal", "sa_theme6", "t_verifying_info", "u_dear_pandemic", "v_data_statistics", "w_sensemaking", "x_risk_assessment") msg_tops <- dat[,c(1, 7:31)] msg_tops <- msg_tops[!duplicated(msg_tops),] msg_tops <- left_join(msg_tops, first_topic, by="message_id") msg_tops$first <- as.factor(msg_tops$first) msg_tops <- msg_tops[, -1] msg_tops <- msg_tops %>% mutate(theme = ifelse(first == "a_immunology" | first == "b_disease_process" | first == "c_immunity_transmission" | first == "d_viral_variant" | first == "e_other_conditions", "a_scientific_medical", ifelse(first == "f_dosage_timing" | first == "g_development_rollout" | first == "h_behavior_vaccination" | first == "i_safety_sideeffects", "b_vaccine", ifelse(first == "j_socializing_safely" | first == "k_ventilation" | first == "m_masking" | first == "j_testing_isolation" | first == "l_sanitation_hygiene", "c_mitigation", ifelse(first == "p_travel" | first == "o_healthcare_policy" | first == "n_education", "d_society", ifelse(first == "r_children_parenting" | first == "s_personal" | first == "q_interacting_family", "e_family_personal", ifelse(first =="t_verifying_info" | first =="u_dear_pandemic"| first =="v_data_statistics"| first == "w_sensemaking" | first =="x_risk_assessment", "f_information", "faketheme"))))))) ##### Submission characteristics ##### mean(str_count(dat$orig, '\\w+'), na.rm=TRUE) hist(str_count(dat$orig, '\\W+'), breaks=100) sd(str_count(dat$orig, '\\W+'), na.rm=TRUE) mean(str_count(dat$message, '\\w+'), na.rm=TRUE) hist(str_count(dat$message, '\\W+'), breaks=100) sd(str_count(dat$message, '\\W+'), na.rm=TRUE) sum(str_count(dat$orig, '.(http)'), na.rm=TRUE) ##### S2 Fig. Daily submissions to the Dear Pandemic question box, August 24, 2020 to August 24, 2021 ##### counts_byday <- dat %>% group_by(date) %>% summarise(n=n()) counts_byday$date <- as.Date(counts_byday$date, format="%m/%d/%Y") byday <- ggplot(counts_byday, aes(x=date, y=n)) + geom_line() + geom_smooth(method="gam", color="red") + theme_bw() + labs(x="Date", y="Number of submissions per day") ggsave("S2_Fig.tiff", byday) ##### S1 Data. Submissions to the Dear Pandemic question box by location ##### location_counts <- dat %>% group_by(location) %>% summarise(n=n()) write_csv(location_counts, "S1_Data.csv") ##### Fig 1. t-SNE visualization of submissions to the Dear Pandemic question box ##### tsne <- Rtsne(msg_tops[,-c(26:28)], dims = 2, perplexity=30, verbose=TRUE, theta=0, max_iter = 1000) first_tsne_df <- data.frame(x = tsne$Y[,1], y = tsne$Y[,2], col=msg_tops$theme, shp = msg_tops$first) fake_rows <- data.frame(x=c(0, 0, 0, 0, 0, 0), y=c(0, 0, 0, 0, 0, 0), col=c("faketheme", "faketheme", "faketheme", "faketheme", "faketheme", "faketheme"), shp=c("0_theme1", "ea_theme2", "ia_theme3", "ma_theme4", "pa_theme5", "sa_theme6")) first_tsne_df <- rbind(fake_rows, first_tsne_df) first_tsne_plot <- ggplot(first_tsne_df) + geom_point(aes(x=x, y=y, color=shp, shape=shp), size=2) + theme_bw() + scale_shape_manual(name="Highest-probability Topic", label=fake.labs, values=c(1, 15, 16, 17, 1, 8, 1, 15, 16, 17, 1, 1, 15, 16, 17, 1, 8, 1, 15, 16, 17, 1, 15, 16, 17, 1, 15, 16, 17, 1, 8), guide=guide_legend(ncol=1)) + scale_color_manual(name="Highest-probability Topic", label=fake.labs, values=c("white", '#F8766D', '#F8766D', '#F8766D', '#F8766D', '#F8766D', "white", '#B79F00', '#B79F00', '#B79F00', '#B79F00', "white", '#00BA38', '#00BA38', '#00BA38', '#00BA38', '#00BA38', "white", '#00BFC4', '#00BFC4', '#00BFC4', "white", '#619CFF', '#619CFF', '#619CFF', "white", '#F564E3', '#F564E3', '#F564E3', '#F564E3', '#F564E3'), guide=guide_legend(ncol=1)) + labs(x="t-SNE Dimension 1", y="t-SNE Dimension 2") ggsave("Fig1.tiff", first_tsne_plot, width=12, height=8) ##### Fig 2. Trends in topic prevalence from August 24, 2020 to August 24, 2021 ##### dat$date <- as.Date(dat$date, format = "%m/%d/%Y") avg_byday <- dat[, c(2, 6:31)] avg_byday <- avg_byday %>% group_by(date) %>% summarize(q_interacting_family = mean(q_interacting_family), h_behavior_vaccination = mean(h_behavior_vaccination), a_immunology = mean(a_immunology), j_socializing_safely = mean(j_socializing_safely), i_safety_sideeffects = mean(i_safety_sideeffects), v_data_statistics = mean(v_data_statistics), x_risk_assessment = mean(x_risk_assessment), r_children_parenting = mean(r_children_parenting), d_viral_variant = mean(d_viral_variant), u_dear_pandemic = mean(u_dear_pandemic), m_masking = mean(m_masking), l_sanitation_hygiene = mean(l_sanitation_hygiene), k_ventilation = mean(k_ventilation), f_dosage_timing = mean(f_dosage_timing), e_other_conditions = mean(e_other_conditions), g_development_rollout = mean(g_development_rollout), j_testing_isolation = mean(j_testing_isolation), n_education = mean(n_education), t_verifying_info = mean(t_verifying_info), o_healthcare_policy = mean(o_healthcare_policy), b_disease_process = mean(b_disease_process), w_sensemaking = mean(w_sensemaking), c_immunity_transmission = mean(c_immunity_transmission), p_travel = mean(p_travel), s_personal = mean(s_personal)) avg_byday <- avg_byday %>% pivot_longer(!date, names_to = "topic", values_to = "avg_prob") avg_byday <- avg_byday %>% mutate(theme = ifelse(topic == "a_immunology" | topic == "b_disease_process" | topic == "c_immunity_transmission" | topic == "d_viral_variant" | topic == "e_other_conditions", "a_scientific_medical", ifelse(topic == "f_dosage_timing" | topic == "g_development_rollout" | topic == "h_behavior_vaccination" | topic == "i_safety_sideeffects", "b_vaccine", ifelse(topic == "j_socializing_safely" | topic == "k_ventilation" | topic == "m_masking" | topic == "j_testing_isolation" | topic == "l_sanitation_hygiene", "c_mitigation", ifelse(topic == "p_travel" | topic == "o_healthcare_policy" | topic == "n_education", "d_society", ifelse(topic == "r_children_parenting" | topic == "s_personal" | topic == "q_interacting_family", "e_family_personal", "f_information")))))) avg_byday$topic <- as.factor(avg_byday$topic) levels(avg_byday$topic) <- topic.labs avg_byday$theme <- factor(avg_byday$theme) levels(avg_byday$theme) <- theme.labs colors <- c("#F8766D", "#B79F00", "#00BA38", "#00BFC4", "#619CFF", "#F564E3") names(colors) <- theme.labs p <- avg_byday %>% split(.$theme) %>% purrr::imap(function(.data, .title) { ggplot(.data, aes(x=date, y=avg_prob)) + stat_smooth(method='gam', color=colors[.title]) + facet_wrap(~topic, nrow=1) + theme_bw() + scale_x_date(date_labels = "%b %y") + theme(legend.position = "none") + labs(title = .title, x="Date", y="Mean Daily Prob.") }) ncat <- avg_byday %>% group_by(theme) %>% summarise(n = n_distinct(topic)) %>% tibble::deframe() ncat_max <- max(ncat) p <- purrr::imap(p, function(x, y) { ncat <- ncat[[y]] n_spacer <- ncat_max - ncat x + plot_spacer() + plot_layout(ncol = 2, widths = c(ncat, n_spacer)) }) plt <- wrap_plots(p, nrow = length(p)) ggsave("Fig2.tiff", plt, width=10, height=12)