## Setup: attach packages and load the tweet data -----------------------------
library(tm)        # text mining: Corpus, tm_map, TermDocumentMatrix
## Loading required package: NLP
library(readr)
library(wordcloud) # wordcloud()
## Loading required package: RColorBrewer
library(readxl)
# (duplicate library(readr) removed — it was attached twice)

# Read the tweets. Keep the text column as character rather than factor so
# the downstream string transformations apply directly, without the
# as.character() coercion the original script needed later on.
covid04 <- read.csv("covid04.csv", stringsAsFactors = FALSE)

## Building the corpus ----

library(tm)  # already attached above; harmless repeat kept from the report
## Build a corpus from the tweet text column.
mydata <- Corpus(VectorSource(covid04$text))

## Cleaning --------------------------------------------------------------------
# The "transformation drops documents" warnings from tm_map.SimpleCorpus are
# expected with a SimpleCorpus and do not indicate data loss here.

# Lower-case first so later pattern matches are effectively case-insensitive.
mydata <- tm_map(mydata, content_transformer(tolower))

# Remove URLs BEFORE stripping non-word characters: once "\\W" characters
# are replaced by spaces, "http://..." is split into separate tokens and
# this pattern can no longer match the whole link.
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
mydata <- tm_map(mydata, content_transformer(removeURL))

# Replace every non-word character with a space. Use the full argument name
# `replacement`; the original `replace=` relied on partial argument matching.
mydata <- tm_map(mydata, content_transformer(gsub),
                 pattern = "\\W", replacement = " ")

# Drop anything that is not a letter or whitespace (digits, symbols, ...).
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
mydata <- tm_map(mydata, content_transformer(removeNumPunct))

# Remove Italian stopwords plus corpus-specific noise terms in ONE pass.
# The original ran removeWords with the full Italian stopword list first,
# which defeated the setdiff() below that was meant to keep "r".
myStopwords <- c(setdiff(stopwords("italian"), c("r")), "coronavirus", "covid")
mydata <- tm_map(mydata, removeWords, myStopwords)

# Collapse the runs of whitespace left behind by the removals above.
mydata <- tm_map(mydata, stripWhitespace)

# Belt-and-braces: removeNumPunct already stripped digits and punctuation,
# but these built-ins are cheap and guarantee nothing slipped through.
mydata <- tm_map(mydata, removeNumbers)
mydata <- tm_map(mydata, removePunctuation)

# Persist the cleaned corpus to disk (one text file per document).
April <- writeCorpus(mydata)

## Term-document matrix ----

## Build the term-document matrix and rank terms by total frequency.
tdm <- TermDocumentMatrix(mydata)

# Sum each term's count across all documents, most frequent first.
term_totals <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)

# Frequency table used by the word cloud and bar plot below.
d <- data.frame(word = names(term_totals), freq = term_totals)
head(d)
##                      word freq
## radiosavana   radiosavana 3626
## italia             italia 2158
## conte               conte 1506
## oggi                 oggi 1359
## tutta               tutta 1335
## commercianti commercianti  998

## Visualization ---------------------------------------------------------------

# Word cloud of the 50 most frequent terms (seed fixed for reproducibility).
set.seed(1234)
wordcloud(
  words = d$word, freq = d$freq, min.freq = 1, max.words = 50,
  random.order = FALSE, rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)

# Bar chart of the ten most frequent terms (d is already sorted by freq).
top_terms <- d[1:10, ]
barplot(
  top_terms$freq, names.arg = top_terms$word, las = 2,
  col = heat.colors(10), main = "Most frequent words",
  ylab = "Word frequencies", cex.names = 0.8
)

## Hashtags --------------------------------------------------------------------

library(quanteda)
## Package version: 2.0.1
## NOTE: quanteda masks tm's as.DocumentTermMatrix/stopwords and NLP's meta;
## the tm pipeline above has already run, so the masking is harmless here.
library(readtext)

# Ensure the text column is plain character. as.character() has no
# `encoding` argument — the original `encoding="UTF-8"` was silently
# ignored; enc2utf8() actually converts/marks the strings as UTF-8.
covid04$text <- enc2utf8(as.character(covid04$text))
mycorp <- corpus(covid04$text)

# Document-feature matrix with punctuation tokens removed.
# (The original also ran `dfm <- dfm(mycorp)`: that object was never used
# and the assignment shadowed the quanteda function `dfm`, so it is dropped.)
tweet_dfm <- dfm(mycorp, remove_punct = TRUE)

# Keep only hashtag features, then build their co-occurrence matrix.
tag_dfm <- dfm_select(tweet_dfm, pattern = ("#*"))
tag_fcm <- fcm(tag_dfm)

# Co-occurrence network restricted to the 50 most frequent hashtags.
toptag <- names(topfeatures(tag_dfm, 50))
topgat_fcm <- fcm_select(tag_fcm, pattern = toptag)
textplot_network(topgat_fcm, min_freq = 0.1, edge_alpha = 0.8, edge_size = 5)