Twitter Sentiment Analysis

Twitter sentiment around the 2020 U.S. presidential election, separated by state, region, and candidate.

Outline


  1. Setup
  2. Stream Data
  3. Collect Data (Biden)
  4. Clean Data (Biden)
  5. Classify Data (Biden)
  6. Find Sentiment in Data (Biden)
  7. Collect Data (Trump)
  8. Clean Data (Trump)
  9. Classify Data (Trump)
  10. Find Sentiment in Data (Trump)
  11. Aggregate Sentiments
  12. A Gradient Map
  13. A Polarized Map
  14. Data Wrangling for Animations
  15. Graphing for Animations
  16. Animations

Raw code


streamtime <- 0.2 * 60 #streaming timeout in seconds (here 0.2 minutes)
tweets_per_state <- 5 #number of tweets per state to be analyzed.

# Section One - Setup ----
# install.packages("textdata")
# install.packages("tidytext")
# install.packages("rtweet")
# install.packages("tidyverse")
# install.packages("plotly")
# install.packages("extrafont")
# install.packages("gganimate")
# install.packages("gifski")

library(tidyverse)
library(rtweet)
library(textdata)
library(tidytext)
library(plotly)
library(extrafont)
library(gganimate)
library(gifski)
library(readxl)
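# note: streaming in section two assumes a valid twitter api token has already
# been registered with rtweet. a minimal sketch with hypothetical credentials
# (the exact call depends on your rtweet version):
# token <- create_token(
#   app = "my_app",                #hypothetical app name
#   consumer_key = "KEY",          #placeholder
#   consumer_secret = "SECRET",    #placeholder
#   access_token = "TOKEN",        #placeholder
#   access_secret = "TOKEN_SECRET" #placeholder
# )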
# Section Two - Stream Data ----

# no upfront parse; save the raw streams as json files
filenames <- c("stream_data_unparsed_b.json", "stream_data_unparsed_t.json")
q <- c("biden", "trump")

for (i in 1:2) {
  stream_tweets(
    q = q[i],
    #is_quote = "FALSE", #could be activated for removing quotes
    parse = FALSE,
    timeout = streamtime,
    file_name = filenames[i]
  ) 
}
# Section Three - Collect Data (Biden) ----

data_b <- parse_stream(filenames[1]) #parse from json file
user_data_b <- users_data(data_b) #get metadata for users

# the code below was used to check which unique identifier to use; both
# screen_name and user_id are unique per account, and user_id was chosen.

# length(unique(data_b$screen_name))
# length(unique(data_b$user_id))

# code below was used to check where location info was most available,
# either in "location" or "quoted_location".

# location_availability_1 <- user_data_b %>% 
#  select("user_id", "location")

# location_availability_2 <- data_b %>% 
#  select("user_id", "quoted_location")

# location_availability_1_na <- remove_missing(location_availability_1) 
# location_availability_2_na <- remove_missing(location_availability_2) 

# retained_location_data_1 <- count(location_availability_1_na)/count(location_availability_1) #~58% of rows have non-missing "location"
# retained_location_data_2 <- count(location_availability_2_na)/count(location_availability_2) #~23% of rows have non-missing "quoted_location"

# merge tibbles to have a unified one with all variables
user_data_b <- user_data_b %>% 
  distinct(user_id, .keep_all = TRUE)

data_b <- data_b %>% 
  distinct(user_id, .keep_all = TRUE)

data_b <- inner_join(x=user_data_b, y=data_b, by=c("user_id" = "user_id")) %>% 
  select("user_id", "location.x", "text", "lang") %>%
  rename(location = "location.x") %>% 
  filter(lang %in% c("en", "und")) %>% #keep english and undetermined-language tweets
  remove_missing() #only complete observations
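# as a minimal illustration of the pipeline above (hypothetical toy data):
# keeping one row per user_id on both sides makes the inner join one-to-one,
# so no tweet is duplicated.
# tibble(user_id = c("a", "a", "b"), text = c("t1", "t2", "t3")) %>%
#   distinct(user_id, .keep_all = TRUE) #keeps only "t1" and "t3"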

# Section Four - Clean Data (Biden) ----

# clean location vector
location_vector_b <- pull(data_b, location)

location_vector_b <- location_vector_b %>% 
  str_trim(side = "both") %>% 
  str_to_lower() %>% 
  str_squish()

data_b <- data_b %>% 
  mutate(location = location_vector_b) #add back the cleaned vector

# clean text vector
data_b$text <- data_b$text %>% str_to_lower()
data_b$text <- gsub("http.*", "", data_b$text) #remove any links
data_b$text <- gsub("https.*", "", data_b$text) #remove any links
data_b$text <- gsub("&amp;", "&", data_b$text) #remove any links
data_b$text <- gsub("\\$", "", data_b$text) #remove "$" since special character in R

data_b <- data_b %>%
  filter(!str_detect(text, "trump")) #filter out tweets containing trump

# Section Five - Classify Data (Biden) ----

state_ab <- str_to_lower(state.abb)
state_name <- str_to_lower(state.name)

# initializing tibble
tweet_matrix_b <- as_tibble(data.frame(matrix(nrow=as.numeric(count(data_b)),ncol=length(state_ab))))
colnames(tweet_matrix_b) <- state_ab

# creates an identification matrix, showing where data is available
for (i in 1:length(tweet_matrix_b)) {
  tweet_matrix_b[i] <- 
    if_else(
      str_detect(pull(data_b, location), state_name[i]) |
        str_detect(pull(data_b, location), state_ab[i]),
      "y",
      "n"
    )
}
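# quick sanity check of the detection rule (hypothetical location strings):
# str_detect("austin, tx", "tx")       #TRUE, caught via the abbreviation
# str_detect("rural oregon", "oregon") #TRUE, caught via the full name
# caveat: two-letter abbreviations can also match inside unrelated words
# (e.g. "in" for indiana matches "berlin"), so some misclassification is possible.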

# uses the identification matrix as a key to build a matrix of tweets
for (j in 1:length(tweet_matrix_b)) {
  for (i in 1:as.numeric(count(tweet_matrix_b))) {
    tweet_matrix_b[i,j] <- 
      if_else(
        as.character(tweet_matrix_b[i,j])
        == "y",
        as.character(data_b[i,3]), #the "3" indexes the "text" column and must change if more variables are selected when creating "data_b"
        "null"
      )
  }
}
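# a vectorized sketch that should be equivalent to the double loop above,
# since data_b$text aligns row-wise with the identification matrix:
# for (j in seq_along(tweet_matrix_b)) {
#   tweet_matrix_b[[j]] <- if_else(tweet_matrix_b[[j]] == "y", data_b$text, "null")
# }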

# the code below can be used to check the number of observations actually collected for the candidate, but is not essential.

# biden_observations <- as_tibble(data.frame(matrix(nrow=1,ncol=length(state_ab)))) #initializing tbl for number of observations.
# colnames(biden_observations) <- state_ab
# 
# # this loop counts the number of observations for biden for each us state and puts them into a matrix.
# for (i in 1:50) {biden_observations[,i] <-
#   as.numeric(count(local_var <- tweet_matrix_b %>%
#                      filter(tweet_matrix_b[,i] != "null")))
# }
# biden_observations
# Section Six - Find Sentiment in Data (Biden) ----

# adjusting bing lexicon
bing_lex <- get_sentiments("bing") %>%
  filter(word != "trump")

# initializing matrices for tokens and sentiment
sentiment_b <- as_tibble(data.frame(matrix(nrow=1,ncol=length(state_ab))))
sentiment_b[,1:50] <- 0
colnames(sentiment_b) <- state_ab
column_local <- tibble()
tokens_local <- tibble()

# extract tokens & evaluate sentiment
time_before <- Sys.time() #this loop tends to be time consuming for large amounts of tweets
for (j in 1:50) { 
  column_local <- tweet_matrix_b[,j] #column_local is used to select each individual us state from the tweet matrix.
  colnames(column_local) <- "column_name"
  column_local <- filter(column_local, column_name != "null") #remove "null" rows.
  if(as.integer(count(column_local)) < tweets_per_state){
    while (as.integer(count(column_local)) < tweets_per_state) { #padding: with a short streamtime a state may not collect enough tweets, so filler rows are added.
      column_local <- add_row(column_local, column_name = "notaword")
    }
  }
  for (i in 1:tweets_per_state) {
    tokens_local <- column_local[i,1] #tokens_local is used to extract the tokens of each individual tweet for each us state.
    colnames(tokens_local) <- "column_name"
    tokens_local <- unnest_tokens(tokens_local, word, "column_name")
    tokens_local <- inner_join(x = tokens_local, y= bing_lex, by = "word") #this is the matching with the "bing" lexicon.
    tokens_local <- tokens_local %>% 
      count(sentiment) %>% 
      spread(sentiment, n, fill = 0)
    if(count(tokens_local) == 0){ #the following if statements are to ensure that each tokens_local has the correct columns.
      tokens_local <- mutate(tokens_local, negative = 0, positive = 0); #if sentiment is missing it is set to zero.
      tokens_local <- add_row(tokens_local, negative = 0, positive = 0)
    }
    if(!exists("negative", tokens_local)){
      tokens_local <- mutate(tokens_local, negative = 0)
    }
    if(!exists("positive", tokens_local)){
      tokens_local <- mutate(tokens_local, positive = 0)
    }
    tokens_local <- mutate(tokens_local, sentiment = positive - negative) #sentiment is positive-negative.
    if(tokens_local$sentiment > 0){ #normalize each user's sentiment to ± 1
      tokens_local$sentiment = 1
    }
    if(tokens_local$sentiment < 0){ #normalize each user's sentiment to ± 1
      tokens_local$sentiment = -1
    }
    sentiment_b[i,j] <- tokens_local$sentiment #this is the final matrix with individual sentiment.
  }
}
time_after <- Sys.time()
time_elapsed <- time_after - time_before
time_elapsed
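# the per-tweet scoring inside the loop can be summarized as a standalone
# helper (a sketch for clarity, not used elsewhere in this script):
# score_tweet <- function(text, lexicon = bing_lex) {
#   words <- unnest_tokens(tibble(txt = text), word, txt)
#   hits <- inner_join(words, lexicon, by = "word")
#   sign(sum(hits$sentiment == "positive") - sum(hits$sentiment == "negative"))
# }
# score_tweet("what a great win") #returns 1: both matched tokens are positive in bing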

# Section Seven - Collect Data (Trump) ----

data_t <- parse_stream(filenames[2]) #parse from json file
user_data_t <- users_data(data_t) #get metadata for users

# merge tibbles to have a unified one with all variables
user_data_t <- user_data_t %>% 
  distinct(user_id, .keep_all = TRUE)

data_t <- data_t %>% 
  distinct(user_id, .keep_all = TRUE)

data_t <- inner_join(x=user_data_t, y=data_t, by=c("user_id" = "user_id")) %>% 
  select("user_id", "location.x", "text", "lang") %>% #possible extensions, "created_at", "name.x"
  rename(location = "location.x") %>% 
  filter(lang %in% c("en", "und")) %>% #keep english and undetermined-language tweets
  remove_missing() #only complete observations

# Section Eight - Clean Data (Trump) ----

# clean location vector
location_vector_t <- pull(data_t, location)

location_vector_t <- location_vector_t %>% 
  str_trim(side = "both") %>% 
  str_to_lower() %>% 
  str_squish()

data_t <- data_t %>% 
  mutate(location = location_vector_t) #add back the cleaned vector

# clean text vector
data_t$text <- data_t$text %>% str_to_lower()
data_t$text <- gsub("http.*", "", data_t$text) #remove any links
data_t$text <- gsub("https.*", "", data_t$text) #remove any links
data_t$text <- gsub("&amp;", "&", data_t$text) #remove any links
data_t$text <- gsub("\\$", "", data_t$text) #remove "$" since special character in R

data_t <- data_t %>%
  filter(!str_detect(text, "biden")) #filter out tweets containing biden

# Section Nine - Classify Data (Trump) ----
# categorization of tweets into US States

# initializing tibble
tweet_matrix_t <- as_tibble(data.frame(matrix(nrow=as.numeric(count(data_t)),ncol=length(state_ab))))
colnames(tweet_matrix_t) <- state_ab

# creates an identification matrix, showing where data is available
for (i in 1:length(tweet_matrix_t)) {
  tweet_matrix_t[i] <- 
    if_else(
      str_detect(pull(data_t, location), state_name[i]) |
        str_detect(pull(data_t, location), state_ab[i]),
      "y",
      "n"
    )
}

# uses the identification matrix as a key to build a matrix of tweets
for (j in 1:length(tweet_matrix_t)) {
  for (i in 1:as.numeric(count(tweet_matrix_t))) {
    tweet_matrix_t[i,j] <- 
      if_else(
        as.character(tweet_matrix_t[i,j])
        == "y",
        as.character(data_t[i,3]), #the "3" indexes the "text" column and must change if more variables are selected when creating "data_t"
        "null"
      )
  }
}
# the code below can be used to check the number of observations actually collected for the candidate, but is not essential.

# trump_observations <- as_tibble(data.frame(matrix(nrow=1,ncol=length(state_ab)))) #initializing tbl for number of observations.
# colnames(trump_observations) <- state_ab
# 
# # this loop counts the number of observations for trump for each us state and puts them into a matrix.
# for (i in 1:50) {
#   trump_observations[,i] <- 
#     as.numeric(count(local_var <- tweet_matrix_t %>% 
#                        filter(tweet_matrix_t[,i] != "null")))
# }
# trump_observations
# Section Ten - Find Sentiment in Data (Trump) ----

# adjusting bing lexicon
bing_lex <- get_sentiments("bing") %>%
  filter(word != "trump")

# initializing matrices for tokens and sentiment
sentiment_t <- as_tibble(data.frame(matrix(nrow=1,ncol=length(state_ab))))
sentiment_t[,1:50] <- 0
colnames(sentiment_t) <- state_ab
column_local <- tibble()
tokens_local <- tibble()

# extract tokens & evaluate sentiment
time_before <- Sys.time() #this loop tends to be time consuming for large amounts of tweets
for (j in 1:50) { 
  column_local <- tweet_matrix_t[,j] #column_local is used to select each individual us state from the tweet matrix.
  colnames(column_local) <- "column_name"
  column_local <- filter(column_local, column_name != "null") #remove "null" rows.
  if(as.integer(count(column_local)) < tweets_per_state){
    while (as.integer(count(column_local)) < tweets_per_state) { #padding: with a short streamtime a state may not collect enough tweets, so filler rows are added.
      column_local <- add_row(column_local, column_name = "notaword")
    }
  }
  for (i in 1:tweets_per_state) {
    tokens_local <- column_local[i,1] #tokens_local is used to extract the tokens of each individual tweet for each us state.
    colnames(tokens_local) <- "column_name"
    tokens_local <- unnest_tokens(tokens_local, word, "column_name")
    tokens_local <- inner_join(x = tokens_local, y= bing_lex, by = "word") #this is the matching with the "bing" lexicon.
    tokens_local <- tokens_local %>% 
      count(sentiment) %>% 
      spread(sentiment, n, fill = 0)
    if(count(tokens_local) == 0){ #the following if statements are to ensure that each tokens_local has the correct columns.
      tokens_local <- mutate(tokens_local, negative = 0, positive = 0); #if sentiment is missing it is set to zero.
      tokens_local <- add_row(tokens_local, negative = 0, positive = 0)
    }
    if(!exists("negative", tokens_local)){
      tokens_local <- mutate(tokens_local, negative = 0)
    }
    if(!exists("positive", tokens_local)){
      tokens_local <- mutate(tokens_local, positive = 0)
    }
    tokens_local <- mutate(tokens_local, sentiment = positive - negative) #sentiment is positive-negative.
    if(tokens_local$sentiment > 0){ #normalize each user's sentiment to ± 1
      tokens_local$sentiment = 1
    }
    if(tokens_local$sentiment < 0){ #normalize each user's sentiment to ± 1
      tokens_local$sentiment = -1
    }
    sentiment_t[i,j] <- tokens_local$sentiment #this is the final matrix with individual sentiment.
  }
}
time_after <- Sys.time()
time_elapsed <- time_after - time_before
time_elapsed

# Section Eleven - Aggregate Sentiments ----

sentiment <- sentiment_b*0 #initializing matrix for aggregated sentiment

# this loop computes a running total of the two candidates' sentiments,
# with trump counted as positive and biden as negative.
# from before, the sentiment of each tweet has been normalized to ±1.
for (j in 1:50) {
  for (i in 1:as.numeric(count(sentiment_b))) {
    sentiment[i,j] <- sentiment[i,j] + sentiment_t[i,j] - sentiment_b[i,j]
    sentiment[i+1,j] <- sentiment[i,j]
  }
}
sentiment <- sentiment[-as.numeric(count(sentiment)),] #remove last duplicate row
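# the double loop above is a column-wise running total; a compact equivalent
# (a sketch, assuming both matrices have identical dimensions) would be:
# sentiment_alt <- as_tibble(apply(sentiment_t - sentiment_b, 2, cumsum))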

sentiment <- sentiment %>%
  mutate(tweet_count = 1:as.numeric(count(sentiment))) %>% 
  pivot_longer(all_of(state_ab), names_to = "state") %>%
  rename(Sentiment = "value") %>% 
  mutate(tooltip = paste0(str_to_title(state_name), "\n", Sentiment))

sentiment$state <- sentiment$state %>% 
  str_to_upper()

# Section Twelve - Gradient Map ----
fontstyle = list(
  family = "Space Mono",
  size = 10,
  color = "black"
)

label = list(
  bgcolor = "#FFFFFF",
  bordercolor = "transparent",
  font = fontstyle
)

us_graph = plot_geo(sentiment,
                    locationmode = "USA-states",
                    frame = ~tweet_count) %>% #set the bottom slider for the map.
  add_trace(locations = ~state,
            z = ~Sentiment,
            zmin = min(sentiment$Sentiment),
            zmax = -min(sentiment$Sentiment),
            color = ~Sentiment,
            colorscale = "RdBu", #alternative: Bluered, RdBu
            text = ~tooltip,
            hoverinfo = "text") %>% 
  layout(geo = list(scope = "usa"),
         font = list(family = "Space Mono"),
         title = "U.S Electoral Outcome\n(Aggregated Tweets)") %>% 
  style(hoverlabel = label) %>% 
  colorbar(
    outlinecolor = "#000000",
    y = 0.8,
    thickness = 25,
    len = 0.5,
    title = list(text = "Sentiment"))

us_graph
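# optional: the interactive map can be saved as a standalone html file,
# assuming the htmlwidgets package is installed:
# htmlwidgets::saveWidget(us_graph, "us_graph.html")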
# Section Thirteen - Polarized Map ----
us_graph_polarized = plot_geo(sentiment,
                              locationmode = "USA-states",
                              frame = ~tweet_count) %>% #add bottom slider for map.
  add_trace(locations = ~state,
            z = ~Sentiment,
            zmin = -1,
            zmax = 1,
            color = ~Sentiment,
            colorscale = "RdBu", #alternative: Bluered, RdBu
            text = ~tooltip,
            hoverinfo = "text") %>% 
  layout(
    geo = list(scope = "usa"),
    font = list(family = "Space Mono"),
    title = "U.S Electoral Outcome\n(Aggregated Tweets)") %>% 
  style(hoverlabel = label) %>% 
  colorbar(
    outlinecolor = "#000000",
    y = 0.8,
    thickness = 25,
    len = 0.5,
    tickmode = "array",
    tickvals = c(-0.7,0,0.7),
    ticktext = c("Biden",0,"Trump"),
    title = list(text = "Sentiment"))

us_graph_polarized
# Section Fourteen - Data Wrangling for Animation ----

# electoral votes per state are imported; they set the point sizes in the animation.
electoral_votes <- read_excel("~/Documents/UiO/Autumn 2020/Data Science.ECON4170/Project/electoral_votes.xlsx") #adjust to your local path
electoral_votes <- electoral_votes %>%
  rename(state = State) %>% 
  mutate(region = as.character(state.region))
electoral_votes$state <- str_to_upper(state_ab)
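# the spreadsheet is assumed to hold one row per state, alphabetically ordered
# (so it lines up with state.region and state_ab), with columns "State" and
# "votes". a hypothetical stand-in for two states:
# electoral_votes <- tribble(
#   ~State,       ~votes,
#   "California", 55,
#   "Wyoming",    3
# )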

sentiment_aggregated_b <- sentiment_b*0 # initializing matrix for aggregated biden series
sentiment_aggregated_t <- sentiment_t*0 # initializing matrix for aggregated trump series
sentiment_animation <- tibble()

# these loops mirror the aggregation in section eleven, but keep each
# candidate's sentiment separate so its evolution over time can be traced.
# the running totals serve as the animation axes.
for (j in 1:50) {
  for (i in 1:as.numeric(count(sentiment_b))) {
    sentiment_aggregated_b[i,j] <- sentiment_aggregated_b[i,j] + sentiment_b[i,j]
    sentiment_aggregated_b[i+1,j] <- sentiment_aggregated_b[i,j]
  }
}

for (j in 1:50) {
  for (i in 1:as.numeric(count(sentiment_t))) {
    sentiment_aggregated_t[i,j] <- sentiment_aggregated_t[i,j] + sentiment_t[i,j]
    sentiment_aggregated_t[i+1,j] <- sentiment_aggregated_t[i,j]
  }
}

sentiment_aggregated_b <- sentiment_aggregated_b[-as.numeric(count(sentiment_aggregated_b)),] #remove last duplicate row
sentiment_aggregated_t <- sentiment_aggregated_t[-as.numeric(count(sentiment_aggregated_t)),] #remove last duplicate row

sentiment_aggregated_b <- sentiment_aggregated_b %>%
  mutate(tweet_count = 1:as.numeric(count(sentiment_aggregated_b))) %>% 
  pivot_longer(all_of(state_ab), names_to = "state") %>%
  rename(Biden = "value")

sentiment_aggregated_t <- sentiment_aggregated_t %>%
  mutate(tweet_count = 1:as.numeric(count(sentiment_aggregated_t))) %>% 
  pivot_longer(all_of(state_ab), names_to = "state") %>%
  rename(Trump = "value")

sentiment_aggregated_b$state <- sentiment_aggregated_b$state %>% 
  str_to_upper()
sentiment_aggregated_t$state <- sentiment_aggregated_t$state %>% 
  str_to_upper()

sentiment_animation <- sentiment_aggregated_b %>%
  mutate(Trump = sentiment_aggregated_t$Trump)

sentiment_animation <- left_join(
  x = sentiment_animation, 
  y = electoral_votes, 
  by = "state")

# Section Fifteen - Graphing for Animation ----

candidate_graph = sentiment_animation %>% 
  ggplot(aes(x = Trump,
             y = Biden,
             color = region,
             size = votes)) +
  geom_point(alpha = 0.5, stroke = 0) +
  xlim(min(sentiment_animation$Trump), -min(sentiment_animation$Trump)) +
  ylim(min(sentiment_animation$Biden), -min(sentiment_animation$Biden)) +
  scale_size_continuous(name = "Electoral Votes", range = c(3,10)) +
  scale_color_brewer(name = "U.S. Region", guide = "legend", palette = "Set2") +
  labs(title = "Aggregated Sentiment of the 50 U.S States by Region",
       x = "Trump Sentiment",
       y = "Biden Sentiment") +
  guides(color = guide_legend(override.aes = list(size = 5))) +
  geom_vline(xintercept = 0, color = "black") +
  geom_hline(yintercept = 0, color = "black") +
  theme(panel.grid.minor = element_blank(),
        panel.grid.major = element_blank(),
        legend.position = "bottom",
        legend.box = "vertical",
        legend.margin = margin(),
        legend.title = element_text(size = 8),
        legend.text = element_text(size = 8),
        plot.title = element_text(hjust = 0.5),
        plot.background = element_rect(fill = "#FFFFFF"),
        panel.background = element_rect(fill = "#FFFFFF"),
        legend.background = element_rect(fill = "#FFFFFF"))

candidate_graph
# Section Sixteen - Animation ----

candidate_graph_anim <- candidate_graph +
  transition_time(tweet_count) +
  labs(subtitle = "Number of Tweets: {frame_time}") + 
  shadow_wake(wake_length = 0.1)

animate(candidate_graph_anim, height = 500, width = 800, fps = 30,
        duration = 10, end_pause = 60, res = 100)

# optional for saving the rendered animation as a .gif file. 
# anim_save("candidate_sentiment.gif")