Function to perform sentiment analysis (for example, on Twitter data)

# based on https://github.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107

# Score the sentiment of each sentence by counting lexicon hits.
#
# Arguments:
#   sentences  character vector of texts to score (e.g. tweets)
#   pos.words  character vector of positive terms
#   neg.words  character vector of negative terms
#   .progress  name of the progress bar to show; passed straight to laply()
#
# Returns a data.frame with one row per sentence and two columns:
#   score  number of positive-word hits minus number of negative-word hits
#   text   the original (uncleaned) sentence
score.sentiment <- function(sentences, pos.words, neg.words, .progress = 'none')
{
  # library() fails loudly when a package is missing, whereas require() only
  # returns FALSE and lets the function die later with an obscure error.
  library(plyr)
  library(stringr)

  # laply: take a list/vector in ("l"), hand back a simple array ("a").
  scores <- laply(sentences, function(sentence, pos.words, neg.words) {

    # strip punctuation, control characters and digits, then lower-case
    sentence <- gsub('[[:punct:]]', '', sentence)
    sentence <- gsub('[[:cntrl:]]', '', sentence)
    sentence <- gsub('\\d+', '', sentence)
    sentence <- tolower(sentence)

    # str_split() returns a list with one element per input string;
    # flatten it into a plain character vector of words
    words <- unlist(str_split(sentence, '\\s+'))

    # %in% yields TRUE/FALSE per word, which sum() counts as 1/0
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }, pos.words, neg.words, .progress = .progress)

  data.frame(score = scores, text = sentences)
}

Advertisements

Analysing Twitter data with the twitteR package

R code:

library(twitteR)
library(tm)
library(wordcloud)
library(stringr)
library(plyr)

# Log in to Twitter (you can find these details on https://apps.twitter.com).
# Replace the placeholder strings below with the codes from your own Twitter
# account. Do not commit real credentials to version control.
consumer_key <- "Consumer Key (API Key)"
consumer_secret <- "Consumer Secret (API Secret)"
access_token <- "Access Token"
access_secret <- "Access Token Secret"

# Authenticate this R session with the four credentials above.
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)

# Some examples
userTimeline("utwente", n = 10) # fetch tweets of a specific user
searchTwitter("utwente", n = 10) # search for keywords

# Discover the twitteR object
?searchTwitter
tweetsList <- searchTwitter("utwente", n = 10)
# Take the first result and inspect it through its accessors and fields.
tweet <- tweetsList[[1]]
tweet$getScreenName()
tweet$getText()
tweet$favoriteCount
tweet$retweetCount

# Harvest tweets based on keyword.
mach_tweets <- searchTwitter("#prayforparis", n = 1500, lang = "en")

# Extract the text from the tweets in a vector.
# See http://davetang.org/muse/2013/04/06/using-the-r_twitter-package/ for an
# approach in Windows.
# vapply() always returns a character vector, even when no tweets were found
# (sapply() would return an empty list in that case).
mach_text <- vapply(mach_tweets, function(x) x$getText(), character(1))
# NOTE(review): "utf-8-mac" looks like a macOS-specific encoding name; other
# platforms probably need a different target encoding -- confirm before reuse.
mach_text <- iconv(mach_text, to = "utf-8-mac")

### Some initial cleaning
# Remove URLs. Only the URL token itself (up to the next whitespace) is
# dropped; the previous greedy pattern also deleted all text after the URL.
mach_text <- gsub("(f|ht)tps?://[^[:space:]]+", "", mach_text, ignore.case = TRUE)

# Remove @UserName
# mach_text <- gsub("@\\w+", "", mach_text)

# Create a corpus
mach_corpus <- Corpus(VectorSource(mach_text))

# Create the term-document matrix, applying some transformations.
# The search terms themselves and URL fragments are treated as stop words.
tdm <- TermDocumentMatrix(
  mach_corpus,
  control = list(
    removePunctuation = TRUE,
    stopwords = c("prayforparis", "paris", "http", "https", stopwords("english")),
    removeNumbers = TRUE,
    tolower = TRUE
  )
)

## Further exploration of the term-document matrix
# Frequent words
findFreqTerms(tdm, lowfreq = 100)

# Associations?
findAssocs(tdm, terms = "syria", corlimit = 0.3)

## Define tdm as a matrix
tdMatrix <- as.matrix(tdm)

# Get word counts in decreasing order
word_freqs <- sort(rowSums(tdMatrix), decreasing = TRUE)

# Create a data frame with words and their frequencies
df <- data.frame(word = names(word_freqs), freq = word_freqs)

# Plot the word cloud into a PDF file
pdf("wcParis.pdf")
wordcloud(df$word, df$freq, random.order = FALSE, colors = brewer.pal(8, "Dark2"), min.freq = 20)
dev.off()
#### Sentiment analyses ####
# based on https://github.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107
# download Opinion Lexicon (Hu and Liu, KDD-2004) http://www.cs.uic.edu/~liub/FBS/opinion-lexicon-English.rar

# Read both word lists from the working directory; lines starting with ";"
# are skipped as comments.
listPosWords <- scan("positive-words.txt", what = "character", comment.char = ";")
listNegWords <- scan("negative-words.txt", what = "character", comment.char = ";")

# Score every tweet (positive hits minus negative hits) and plot the distribution.
sentScoreTweets <- score.sentiment(mach_text, listPosWords, listNegWords, .progress = "text")
hist(sentScoreTweets$score)