Analysing Twitter data with the twitteR package

R code:

library(twitteR)
library(tm)
library(wordcloud)
library(stringr)
library(plyr)

#Log in to Twitter (you can find these details on https://apps.twitter.com)
#Replace the strings between "" with the credentials from your own Twitter account.
consumer_key <- "Consumer Key (API Key)"
consumer_secret <- "Consumer Secret (API Secret)"
access_token <- "Access Token"
access_secret <- "Access Token Secret"

setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
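
A quick way to check that the OAuth handshake succeeded is to request a public user object; getUser() is part of twitteR, and "utwente" is simply the account used in the examples below:

#Sanity check: this call errors if authentication failed
getUser("utwente")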

#Some examples
userTimeline("utwente", n = 10) # retrieve the tweets of a specific user
searchTwitter("utwente", n = 10) # search for a keyword

#Explore a twitteR status object
?searchTwitter
tweetsList <- searchTwitter("utwente", n = 10)
tweet <- tweetsList[[1]]
tweet$getScreenName()
tweet$getText()
tweet$favoriteCount
tweet$retweetCount
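
If you would rather work with all tweets at once instead of inspecting them one by one, twitteR's twListToDF() converts a list of status objects into a data frame. This step is optional and not used further below:

#Optional: one row per tweet, with columns such as text and retweetCount
tweetsDF <- twListToDF(tweetsList)
head(tweetsDF[, c("screenName", "favoriteCount", "retweetCount")])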

#Harvest tweets based on a keyword.
mach_tweets <- searchTwitter("#prayforparis", n = 1500, lang = "en")

#Extract the text from the tweets into a vector.
#The iconv() call below is OS X-specific; see
#http://davetang.org/muse/2013/04/06/using-the-r_twitter-package/ for an
#approach in Windows.
mach_text <- sapply(mach_tweets, function(x) x$getText())
mach_text <- iconv(mach_text, to = "utf-8-mac")

###Some initial cleaning
# Remove URLs
mach_text <- gsub("(f|ht)(tp)(s?)(://)(.*)[.|/](.*)", "", mach_text, ignore.case = TRUE)

# Remove @UserName (optional; uncomment to strip user mentions)
#mach_text <- gsub("@\\w+", "", mach_text)
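
Depending on the analysis you may want to strip more noise before building the corpus. The two substitutions below are an optional, illustrative sketch (the patterns are mine, not from the twitteR or tm packages):

# Optional extra cleaning (illustrative patterns):
mach_text <- gsub("\\bRT\\b", "", mach_text) # drop the "RT" retweet marker
mach_text <- gsub("\\s+", " ", mach_text)    # collapse repeated whitespace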

# Create a corpus
mach_corpus <- Corpus(VectorSource(mach_text))

# create a term-document matrix, applying some transformations
tdm <- TermDocumentMatrix(mach_corpus,
                          control = list(removePunctuation = TRUE,
                                         stopwords = c("prayforparis", "paris", "http", "https", stopwords("english")),
                                         removeNumbers = TRUE, tolower = TRUE))
## further exploration of the term-document matrix
# words that occur at least 100 times
findFreqTerms(tdm, lowfreq = 100)

# which terms are associated with "syria"? (correlation of at least 0.3)
findAssocs(tdm, terms = "syria", corlimit = 0.3)
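
With 1,500 tweets the term-document matrix can get large. Before converting it to a dense matrix you can optionally trim very rare terms with tm's removeSparseTerms(); the 0.99 cutoff below is an illustrative choice, not a recommendation:

# Optional: drop terms absent from more than 99% of documents
tdm <- removeSparseTerms(tdm, sparse = 0.99)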

## convert tdm to a plain matrix
tdMatrix <- as.matrix(tdm)

# get word counts in decreasing order
word_freqs <- sort(rowSums(tdMatrix), decreasing=TRUE)

# create a data frame with words and their frequencies
df <- data.frame(word=names(word_freqs), freq=word_freqs)

# plot wordcloud
pdf("wcParis.pdf")
wordcloud(df$word, df$freq, random.order = FALSE, colors = brewer.pal(8, "Dark2"), min.freq = 20)
dev.off()

####Sentiment analysis####
# based on https://github.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107
# download the Opinion Lexicon (Hu and Liu, KDD-2004): http://www.cs.uic.edu/~liub/FBS/opinion-lexicon-English.rar

listPosWords <- scan("positive-words.txt", what = "character", comment.char = ";")
listNegWords <- scan("negative-words.txt", what = "character", comment.char = ";")
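
Note that score.sentiment() is not part of any package; it comes from Jeffrey Breen's tutorial linked above. A minimal version, adapted from that tutorial (it uses plyr and stringr, which are loaded at the top of the script):

score.sentiment <- function(sentences, pos.words, neg.words, .progress = "none") {
  scores <- laply(sentences, function(sentence, pos.words, neg.words) {
    # strip punctuation, control characters and digits, then lowercase
    sentence <- gsub("[[:punct:]]", "", sentence)
    sentence <- gsub("[[:cntrl:]]", "", sentence)
    sentence <- gsub("\\d+", "", sentence)
    sentence <- tolower(sentence)
    # split into words and match them against the opinion lexicon
    words <- unlist(str_split(sentence, "\\s+"))
    pos.matches <- !is.na(match(words, pos.words))
    neg.matches <- !is.na(match(words, neg.words))
    # score = number of positive words minus number of negative words
    sum(pos.matches) - sum(neg.matches)
  }, pos.words, neg.words, .progress = .progress)
  data.frame(score = scores, text = sentences)
}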

sentScoreTweets <- score.sentiment(mach_text, listPosWords, listNegWords, .progress = "text")
hist(sentScoreTweets$score)
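
As a quick numeric companion to the histogram (base R, nothing package-specific):

# distribution of sentiment scores in numbers
summary(sentScoreTweets$score)
table(sign(sentScoreTweets$score)) # counts of negative / neutral / positive tweets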
