Sentiment Analysis on Twitter Data

Analyzing Twitter data

1. Extract tweets and followers from Twitter with R and the twitteR package

2. With the tm package, clean the text by removing punctuation, numbers, hyperlinks and stop words, followed by stemming and stem completion

3. Build a term-document matrix

4. Analyse topics with the topicmodels package

5. Analyse sentiment with the sentiment140 package

6. Analyse following/followed and retweeting relationships with the igraph package

An eBook on Twitter analysis using R and a sample set of Twitter data are attached to the original post.

**********************************************************************************************************

install.packages("twitteR", dependencies = TRUE)
library("twitteR")
library("plyr")
library("httr")
library("stringr")
library("maps")
library("tm")

## This creates a one-time authorization with the Twitter API
options(httr_oauth_cache = TRUE)

## The following credentials should be taken from your Twitter developer account.
api_key <- "DWZXXXXXXXXXU74rCth4jr2etg"

api_secret <- "JIdtf3CJblIsHTwq3oBfikeGONV0CldRqXXXXXXXX3VnE"

access_token <- "172317191-ZtOZMfrED5ICXXXXXXXXXXjactKGvyDx2"

access_token_secret <- "ElEwGFJxZ3WnDJXXXXXXXX2AaYfhkoQD1mOePWGqrj"

***********************************************************************************************************

## Pass the above credentials

setup_twitter_oauth(api_key, api_secret, access_token, access_token_secret)

tweet <- searchTwitter("#EUref", n = 5000)   ## Twitter hashtag to be analysed
class(tweet)
tweet_EUreferendum <- searchTwitter("#EUreferendum", n = 5000)
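
The two hashtag searches can also be combined into a single list before analysis; a minimal sketch (tweets_combined is our own name, not from the original post):

# c() concatenates the two lists of status objects into one
tweets_combined <- c(tweet, tweet_EUreferendum)
tweets_combined.df <- twListToDF(tweets_combined)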

## Optional
# Use the searchTwitter function to only get English tweets within 100 miles of the given coordinates (southern England), posted since 10 May 2016
tweets_geolocated <- searchTwitter("#EUref OR #EUreferendum OR #VoteLeave OR #Brexit", n = 5000, lang = "en", geocode = "51.278236,-0.95171109,100mi", since = "2016-05-10")

# lang, geocode and since are optional; tweets can also be fetched without these parameters
tweets_geolocated.df <- twListToDF(tweets_geolocated)

# Extract the tweet text
tweets.text <- lapply(tweets_geolocated, function(t) t$getText())
class(tweets.text)

# Write the data frame to a local CSV file
write.csv(tweets_geolocated.df, file = "C:/Users/XXX/Desktop/tweets_geolocated.df.csv", row.names = TRUE)

#- Cleaning data using the tm package
mycorpus <- Corpus(VectorSource(tweets_geolocated.df$text))
x <- tweets_geolocated.df$text   # keep the raw text as a character vector for sentiment scoring later
str(mycorpus)

# Removing white spaces
mycorpus1 <- tm_map(mycorpus, stripWhitespace)

# Converting text to lower case (content_transformer keeps the corpus structure intact)
mycorpus2 <- tm_map(mycorpus1, content_transformer(tolower))

# Removing stopwords
mycorpus3 <- tm_map(mycorpus2, removeWords, stopwords("english"))

# Removing Punctuation
mycorpus4 <- tm_map(mycorpus3, removePunctuation)

# Removing Numbers
mycorpus5 <- tm_map(mycorpus4, removeNumbers)
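
The outline also calls for removing hyperlinks, which the steps above skip. A minimal sketch using a custom transformation (removeURL is our own helper, not part of tm); if used, feed mycorpus5a into the next step instead of mycorpus5:

# Removing hyperlinks with a user-defined transformation
removeURL <- function(x) gsub("http[[:alnum:][:punct:]]*", "", x)
mycorpus5a <- tm_map(mycorpus5, content_transformer(removeURL))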

# Plain text documents

mycorpus6 <- tm_map(mycorpus5, PlainTextDocument)
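
The outline also mentions stemming and stem completion; a brief sketch, assuming the SnowballC package is installed (mycorpus7 is our own name):

# Stemming reduces words to their root form
library(SnowballC)
mycorpus7 <- tm_map(mycorpus6, stemDocument)
# Stem completion maps stems back to the most frequent full form found in a dictionary corpus
stemCompletion(c("analys", "referend"), dictionary = mycorpus5)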

write.csv(data.frame(text = sapply(mycorpus6, as.character)), file = "C:/Users/XXX/Desktop/mycorpus6.csv", row.names = FALSE)

data_dtm1 <- DocumentTermMatrix(mycorpus6)

inspect(data_dtm1)

#### Frequent Words coming in the text document #####
frequent <- findFreqTerms(data_dtm1, lowfreq = 100, highfreq = Inf)
frequent
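
With the document-term matrix built, outline step 4 (topic analysis with the topicmodels package) can also be sketched; k = 5 topics and the seed are arbitrary choices of ours:

install.packages("topicmodels", dependencies = TRUE)
library(topicmodels)
# LDA requires every document to contain at least one term, so drop empty rows first
row_totals <- slam::row_sums(data_dtm1)
lda_model <- LDA(data_dtm1[row_totals > 0, ], k = 5, control = list(seed = 123))
terms(lda_model, 10)   # top 10 terms per topic
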
##### Generating Word Cloud #######
#############################
install.packages("wordcloud", dependencies = TRUE)
install.packages("stringr", dependencies = TRUE)
library(stringr)
library(wordcloud)
wordcloud::wordcloud(mycorpus6, max.words = 50, random.order = FALSE)

pal <- brewer.pal(9, "YlGnBu")   # RColorBrewer is loaded with wordcloud
pal <- pal[-(1:4)]               # drop the lightest shades
set.seed(123)
wordcloud::wordcloud(words = mycorpus6, scale = c(5, 0.1), max.words = 100,
                     random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE, colors = pal)

### Counting the occurrence of the words in the document
install.packages("slam", dependencies = TRUE)
library(slam)
freq <- colapply_simple_triplet_matrix(data_dtm1, FUN = sum)
freq
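
slam's col_sums() gives the same column totals as a one-liner:

freq <- col_sums(data_dtm1)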
### Writing the results out to CSV
write.csv(frequent, file = "C:/Users/XXX/Desktop/frequent_words_final.csv")
write.csv(freq, file = "C:/Users/XXX/Desktop/total_words_count_final.csv")

################ Sentiment Analysis ###############
############################################
library(syuzhet)
library(lubridate)
library(ggplot2)
library(scales)
library(reshape2)
library(dplyr)
sentiment <- get_nrc_sentiment(x)   # x is the raw tweet text kept earlier
sentiment
sentiment_matrix <- as.matrix(sentiment)   # matrix form, in case it is needed downstream
write.csv(sentiment, "sentiment_score.csv")
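
get_nrc_sentiment() returns a data frame with eight NRC emotion columns plus negative and positive counts; a simple net-polarity column can be derived from the last two (polarity is our own derived column, not part of syuzhet):

# Net polarity per tweet: positive word count minus negative word count
sentiment$polarity <- sentiment$positive - sentiment$negative
head(sentiment[order(-sentiment$polarity), ])   # scores of the most positive tweets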

getwd()

######## Visualizing the sentiment score #########
#######################################

comments <- cbind(tweets_geolocated.df$text, sentiment)
sentimentTotals <- data.frame(colSums(sentiment[, 1:8]))   # the first 8 columns are the NRC emotions
names(sentimentTotals) <- "count"
sentimentTotals <- cbind("sentiment" = rownames(sentimentTotals), sentimentTotals)
rownames(sentimentTotals) <- NULL

ggplot(data = sentimentTotals, aes(x = sentiment, y = count)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme(legend.position = "none") +
  xlab("Sentiment") + ylab("Total Count") + ggtitle("Total Sentiment Score for all Tweets")
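
Outline step 6 (retweet relationships with the igraph package) is not covered above; a minimal sketch, assuming retweets follow the usual "RT @user:" convention in the text column and that the screenName column came through twListToDF:

install.packages("igraph", dependencies = TRUE)
library(igraph)
# Build edges from the retweeting user to the user being retweeted
rt_pattern <- "^RT @([A-Za-z0-9_]+)"
is_rt <- grepl(rt_pattern, tweets_geolocated.df$text)
edges <- data.frame(
  from = tweets_geolocated.df$screenName[is_rt],
  to   = sub("^RT @([A-Za-z0-9_]+).*", "\\1", tweets_geolocated.df$text[is_rt])
)
rt_graph <- graph_from_data_frame(edges, directed = TRUE)
plot(rt_graph, vertex.size = 3, vertex.label.cex = 0.6, edge.arrow.size = 0.2)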
