## Setup: attach packages and load the tweet data -----------------------------
library(tm)        # text mining: Corpus, tm_map, TermDocumentMatrix
## Loading required package: NLP
library(readr)
library(wordcloud) # wordcloud()
## Loading required package: RColorBrewer
library(readxl)
# (duplicate library(readr) removed — it was attached twice)

# Read the tweets. Keep the text column as character rather than factor so
# the downstream string transformations apply directly, without the
# as.character() coercion the original script needed later on.
covid04 <- read.csv("covid04.csv", stringsAsFactors = FALSE)

## Building the corpus ----

library(tm)  # already attached above; harmless repeat kept from the report
## Build a corpus from the tweet text column.
mydata <- Corpus(VectorSource(covid04$text))

## Cleaning --------------------------------------------------------------------
# The "transformation drops documents" warnings from tm_map.SimpleCorpus are
# expected with a SimpleCorpus and do not indicate data loss here.

# Lower-case first so later pattern matches are effectively case-insensitive.
mydata <- tm_map(mydata, content_transformer(tolower))

# Remove URLs BEFORE stripping non-word characters: once "\\W" characters
# are replaced by spaces, "http://..." is split into separate tokens and
# this pattern can no longer match the whole link.
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
mydata <- tm_map(mydata, content_transformer(removeURL))

# Replace every non-word character with a space. Use the full argument name
# `replacement`; the original `replace=` relied on partial argument matching.
mydata <- tm_map(mydata, content_transformer(gsub),
                 pattern = "\\W", replacement = " ")

# Drop anything that is not a letter or whitespace (digits, symbols, ...).
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
mydata <- tm_map(mydata, content_transformer(removeNumPunct))

# Remove Italian stopwords plus corpus-specific noise terms in ONE pass.
# The original ran removeWords with the full Italian stopword list first,
# which defeated the setdiff() below that was meant to keep "r".
myStopwords <- c(setdiff(stopwords("italian"), c("r")), "coronavirus", "covid")
mydata <- tm_map(mydata, removeWords, myStopwords)

# Collapse the runs of whitespace left behind by the removals above.
mydata <- tm_map(mydata, stripWhitespace)

# Belt-and-braces: removeNumPunct already stripped digits and punctuation,
# but these built-ins are cheap and guarantee nothing slipped through.
mydata <- tm_map(mydata, removeNumbers)
mydata <- tm_map(mydata, removePunctuation)

# Persist the cleaned corpus to disk (one text file per document).
April <- writeCorpus(mydata)

## Term-document matrix ----

## Build the term-document matrix and rank terms by total frequency.
tdm <- TermDocumentMatrix(mydata)

# Sum each term's count across all documents, most frequent first.
term_totals <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)

# Frequency table used by the word cloud and bar plot below.
d <- data.frame(word = names(term_totals), freq = term_totals)
head(d)
##                      word freq
## radiosavana   radiosavana 3626
## italia             italia 2158
## conte               conte 1506
## oggi                 oggi 1359
## tutta               tutta 1335
## commercianti commercianti  998

## Visualization ---------------------------------------------------------------

# Word cloud of the 50 most frequent terms (seed fixed for reproducibility).
set.seed(1234)
wordcloud(
  words = d$word, freq = d$freq, min.freq = 1, max.words = 50,
  random.order = FALSE, rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)

# Bar chart of the ten most frequent terms (d is already sorted by freq).
top_terms <- d[1:10, ]
barplot(
  top_terms$freq, names.arg = top_terms$word, las = 2,
  col = heat.colors(10), main = "Most frequent words",
  ylab = "Word frequencies", cex.names = 0.8
)

## Hashtags --------------------------------------------------------------------

library(quanteda)
## Package version: 2.0.1
## NOTE: quanteda masks tm's as.DocumentTermMatrix/stopwords and NLP's meta;
## the tm pipeline above has already run, so the masking is harmless here.
library(readtext)

# Ensure the text column is plain character. as.character() has no
# `encoding` argument — the original `encoding="UTF-8"` was silently
# ignored; enc2utf8() actually converts/marks the strings as UTF-8.
covid04$text <- enc2utf8(as.character(covid04$text))
mycorp <- corpus(covid04$text)

# Document-feature matrix with punctuation tokens removed.
# (The original also ran `dfm <- dfm(mycorp)`: that object was never used
# and the assignment shadowed the quanteda function `dfm`, so it is dropped.)
tweet_dfm <- dfm(mycorp, remove_punct = TRUE)

# Keep only hashtag features, then build their co-occurrence matrix.
tag_dfm <- dfm_select(tweet_dfm, pattern = ("#*"))
tag_fcm <- fcm(tag_dfm)

# Co-occurrence network restricted to the 50 most frequent hashtags.
toptag <- names(topfeatures(tag_dfm, 50))
topgat_fcm <- fcm_select(tag_fcm, pattern = toptag)
textplot_network(topgat_fcm, min_freq = 0.1, edge_alpha = 0.8, edge_size = 5)