R Scripts - 100 days of Tweets

Bruce’s R Script:

library(jsonlite)
library(tidyverse)
library(stringr)
library(tidytext)
library(lubridate)

#Get data from Trump Twitter Archive and make dataframe from JSON data
trump <- fromJSON("~/Documents/condensed_2017.json")

#Convert id_str to numeric for filtering (note: 18-digit IDs sit above exact double precision)
trump$id <- trump$id_str %>% as.numeric()

trump100 <- filter(trump, id >= 822421390125043713 & id <= 858375971019399169)
trump100 <- filter(trump100, substr(text, start = 1, stop = 4) != "RT @")
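#Optional sketch: the 18-digit IDs above exceed exact double precision, so the numeric
#comparison can round at the boundaries. A precision-safe alternative, assuming the bit64
#package is installed (id64 and trump100_exact are illustrative names, not used below):
#  library(bit64)
#  trump$id64 <- as.integer64(trump$id_str)
#  trump100_exact <- filter(trump,
#                           id64 >= as.integer64("822421390125043713"),
#                           id64 <= as.integer64("858375971019399169"))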

trump100$text_time <- as.POSIXct(strptime(trump100$created_at, "%a %b %d %H:%M:%S %z %Y"))
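#Since lubridate is loaded, an equivalent parse is possible (a sketch; the orders string is
#an assumption for Twitter's created_at layout, and the result is returned in UTC):
#  trump100$text_time <- parse_date_time(trump100$created_at, orders = "a b d H:M:S z Y")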

#Select variables
tt <- select(trump100, text, text_time, id)

#Unnest Trump Tweets
ttc <- tt %>%
  unnest_tokens(word, text)

#Remove Stop Words
tm_stopwords <- c(tm::stopwords("en"), "will", "s", "u", "m", "today", "just", "thank", "now", "one", "trump", "https", "t.co", "a.m", "p.m", "amp", "9", "00")
tm_stop <- tibble(word = tm_stopwords)
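#Optional sketch: the stop-word list can also be built from tidytext's bundled stop_words
#data frame instead of tm (tm_stop_alt is an illustrative name, not used later):
tm_stop_alt <- bind_rows(distinct(select(stop_words, word)),
                         tibble(word = c("will", "s", "u", "m", "today", "just", "thank",
                                         "now", "one", "trump", "https", "t.co", "a.m",
                                         "p.m", "amp", "9", "00")))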

#remove other noise

replace_reg <- "https://t.co/[A-Za-z\\d]+|http://[A-Za-z\\d]+|&amp;|&lt;|&gt;|RT|https"
unnest_reg <- "([^A-Za-z_\\d#@']|'(?![A-Za-z_\\d#@]))"
ttc <- tt %>%
  filter(!str_detect(text, "^RT")) %>%
  mutate(text = str_replace_all(text, replace_reg, "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = unnest_reg) %>%
  filter(!word %in% tm_stop$word,
         str_detect(word, "[a-z]"))

#top 10 words
count(ttc, word, sort = TRUE)
word_counts <- data.frame(count(ttc, word, sort = TRUE))

ttc %>%
  count(word, sort = TRUE) %>%
  filter(n > 20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(colour = "#1605B2", fill = "#1605B2") +
  labs(x = NULL, y = "Frequency") +
  coord_flip()

#Sentiment Analysis
#Add row_num
ttc$row_n <- seq_len(nrow(ttc))  #one running index per token (5,113 rows in this data)

ttc_sentwhole <- ttc %>%
  inner_join(get_sentiments("bing")) %>%
  count(id, time = row_n %/% 5113, sentiment) %>%  #effectively one time bin, so this is net sentiment per tweet
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)

mean(ttc_sentwhole$sentiment)
sd(ttc_sentwhole$sentiment)

ggplot(ttc_sentwhole, aes(sentiment)) +
  geom_bar(colour = "#1605B2", fill = "#1605B2") +
  labs(y = "Count", x = "Sentiment of Words of Trump Tweets")

#Net sentiment grouped into blocks of 40 tokens (superseded by the date-based version below)
ttc_s1 <- ttc %>%
  inner_join(get_sentiments("bing")) %>%
  count(id, time = row_n %/% 40, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)

ttc_s1 <- ttc %>%
  inner_join(get_sentiments("bing")) %>%
  count(id, time = text_time, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
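#Optional quick summary of the per-tweet table (a small added check):
mean(ttc_s1$sentiment < 0)  #share of matched tweets whose Bing words are net negative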

ggplot(ttc_s1, aes(time, sentiment)) +
  geom_smooth(show.legend = FALSE) +
  labs(y = "Sentiment", x = "Date")

bing_word_counts <- ttc %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

bing_word_counts

bing_word_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to Sentiment",
       x = NULL) +
  coord_flip()

#Wordcloud
library(wordcloud)

ttc %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 75))

library(reshape2)

ttc %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("#E90C0C", "#03AB3B"),
                   max.words = 100)

#ngrams

tbigrams <- tt %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

tbigrams %>%
  count(bigram, sort = TRUE)

library(tidyr)

tbigrams_separated <- tbigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

tbigrams_filtered <- tbigrams_separated %>%
  filter(!word1 %in% tm_stop$word) %>%
  filter(!word2 %in% tm_stop$word)

# new bigram counts:
tbigram_counts <- tbigrams_filtered %>%
  count(word1, word2, sort = TRUE)

tbigram_counts

tbigrams_united <- tbigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

tbigrams_united$bigram

tbigrams_united %>%
  count(bigram, sort = TRUE) %>%
  filter(n > 5) %>%
  mutate(bigram = reorder(bigram, n)) %>%
  ggplot(aes(bigram, n)) +
  geom_col(colour = "#1605B2", fill = "#1605B2") +
  labs(x = NULL, y = "Frequency") +
  coord_flip()

count(tbigrams_united, bigram, sort = TRUE)

#trigrams
tt %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% tm_stop$word,
         !word2 %in% tm_stop$word,
         !word3 %in% tm_stop$word) %>%
  count(word1, word2, word3, sort = TRUE)

#Get word associations
library(tm)
library(plyr)

#Building a corpus of text

Encoding(tt$text) <- "latin1"

# Make a volatile corpus
tt_source <- VectorSource(tt$text)

tt_corpus <- VCorpus(tt_source)

#make all lower
tt_corpus <- tm_map(tt_corpus, content_transformer(tolower), lazy = TRUE)

#removeURL
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
tt_corpus <- tm_map(tt_corpus, content_transformer(removeURL))
tt_corpus[[6]][1]

#remove newline characters
remove_n <- function(x) gsub("\n", "", x)
tt_corpus <- tm_map(tt_corpus, content_transformer(remove_n))

#remove numbers and punctuation
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
tt_corpus <- tm_map(tt_corpus, content_transformer(removeNumPunct))

#remove extra white space
tt_corpus <- tm_map(tt_corpus, stripWhitespace, lazy = TRUE)

#stopwords

mystopwords <- c(stopwords("en"), "will", "amp", "¼íºí", "íºí", "trump", "today", "just", "now")  #the garbled entries strip emoji mojibake left by the latin1 re-encoding
tt_corpussw <- tm_map(tt_corpus, removeWords, mystopwords, lazy = TRUE)

#Document Term Matrix
tt_dtm_sw <- DocumentTermMatrix(tt_corpussw)
tt_dtm_sw

# make words into columns
tt_dtm_m_sw <- as.matrix(tt_dtm_sw)

#create a term document matrix
tt_tdm_sw <- TermDocumentMatrix(tt_corpussw)

tt_tdm_m_sw <- as.matrix(tt_tdm_sw)
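
#Optional quick frequency check from the term-document matrix (a small added sketch):
term_freq <- sort(rowSums(tt_tdm_m_sw), decreasing = TRUE)  #total count of each term across all tweets
head(term_freq, 10)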

#associations

findAssocs(tt_tdm_sw, "great", 0.2)

findAssocs(tt_tdm_sw, "fake", 0.2)

findAssocs(tt_tdm_sw, "failing", 0.2)

findAssocs(tt_tdm_sw, "obama", 0.2)

findAssocs(tt_tdm_sw, "russia", 0.2)

findAssocs(tt_tdm_sw, "foxnews", 0.2)

#plot association

source("http://bioconductor.org/biocLite.R")
biocLite("Rgraphviz")
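#Note: biocLite() has since been retired by Bioconductor; on current releases the equivalent
#install would be roughly (a sketch, assuming BiocManager is available from CRAN):
#  install.packages("BiocManager")
#  BiocManager::install("Rgraphviz")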

freq.term15 <- findFreqTerms(tt_tdm_sw, lowfreq = 18)
freq.terms <- c(freq.term15, "cnn")
plot(tt_tdm_sw, term = freq.terms, corThreshold = 0.1, weighting = T,
     attrs = list(graph = list(rankdir = "TB"),
                  node = list(shape = "rectangle", fixedsize = FALSE)))

Connor’s R script:

library(jsonlite)
library(dplyr)
library(ggplot2)

#Get data from Trump Twitter Archive and make dataframe from JSON data
trump <- fromJSON("~/Text Mining with R/condensed_2017.json")

#Convert id column to numeric
trump$id <- trump$id_str %>% as.numeric()  #note: 18-digit IDs sit above exact double precision

#Filter tweets by first 100 days and non-RTs
trump100 <- filter(trump, id >= 822421390125043713 & id <= 858375971019399169)
trump100 <- filter(trump100, substr(text, start = 1, stop = 4) != "RT @")

#Convert created_at column to date format as new column
trump100$created <- as.POSIXct(strptime(trump100$created_at, "%a %b %d %H:%M:%S %z %Y"))

#Function to filter tweets by term
FilterTerms <- function(source.df, term) {
  mentions.df <- as.data.frame(grepl(paste(term, collapse = "|"), source.df$text, ignore.case = TRUE))
  colnames(mentions.df) <- "mention"
  termmentions.df <- dplyr::bind_cols(mentions.df, source.df)
  termmentions.df <- dplyr::filter(termmentions.df, mention == TRUE)
  termmentions.df <- termmentions.df[, -1]
  return(termmentions.df)
}
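
#For reference, the same filtering can be written as a single dplyr call (a sketch that should
#behave equivalently; FilterTerms2 is an illustrative name not used elsewhere in this script):
FilterTerms2 <- function(source.df, term) {
  dplyr::filter(source.df, grepl(paste(term, collapse = "|"), text, ignore.case = TRUE))
}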

#Filter tweets by topic
healthcare <- c("health", "obamacare", "repeal", "medical")
immigration <- c("immigration", "wall", "border", "sanctuary", "travel", "ban")
economy <- c("trade", "economy", "jobs")
news <- c("failing", "fake", "msnbc", "nyt", "york", "cnn", "cbs", "abc", "nbc", "buzzfeed", "media", "mainstream")

trump_health_tweets <- FilterTerms(trump100, healthcare)
trump_immigration_tweets <- FilterTerms(trump100, immigration)
trump_economy_tweets <- FilterTerms(trump100, economy)
trump_news_tweets <- FilterTerms(trump100, news)

#Remove tweets that aren’t topical
trump_health_tweets <- trump_health_tweets[-c(5), ]
trump_immigration_tweets <- trump_immigration_tweets[-c(7, 12, 14, 20, 21, 41), ]
trump_economy_tweets <- trump_economy_tweets[-c(3), ]
trump_news_tweets <- trump_news_tweets[-c(8, 26, 27, 31, 32, 33, 38, 43, 54, 64, 69, 70, 71), ]

#Bind tweets and create a Policy column
trump_policy_tweets <- bind_rows(mutate(trump_health_tweets, Policy = "Healthcare"),
                                 mutate(trump_immigration_tweets, Policy = "Immigration"),
                                 mutate(trump_economy_tweets, Policy = "Economy"),
                                 mutate(trump_news_tweets, Policy = "War on News"))
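
#Optional sanity check on the combined data frame before plotting (a small added sketch):
count(trump_policy_tweets, Policy)  #number of tweets tagged with each Policy label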

#Create density plot (direct.label and dl.trans come from the directlabels package)
library(directlabels)

b <- ggplot(trump_policy_tweets, aes(created)) +
  geom_density(kernel = "gaussian", aes(y = ..count.., group = Policy,
                                        colour = Policy, fill = Policy), alpha = .1) +
  theme(axis.line = element_blank(), axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(), legend.position = "none",
        panel.background = element_blank(), panel.border = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), plot.background = element_blank())
direct.label(b, list("top.bumptwice", dl.trans(y = y + 0.2)))