# Load text-mining and plotting dependencies (each exactly once —
# the original loaded tm and readr twice).
library(tm)        # Corpus, tm_map, TermDocumentMatrix
## Loading required package: NLP
library(readr)
library(wordcloud) # wordcloud()
## Loading required package: RColorBrewer
library(readxl)

# April tweet dump; expects at least a 'text' column.
covid04 <- read.csv("covid04.csv")
# Build a corpus from the raw tweet text and normalise it.
# Note: tm_map() on a SimpleCorpus emits the warning
# "transformation drops documents" for custom transformers; it is benign.
mydata <- Corpus(VectorSource(covid04$text))

## Cleaning ----

# Lower-case first so later pattern matches are effectively case-insensitive.
mydata <- tm_map(mydata, content_transformer(tolower))

# Strip URLs *before* any punctuation removal: once ':' '/' '.' are replaced
# by spaces, a URL can no longer be matched as a single token and only the
# leading "https" fragment would be removed.
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
mydata <- tm_map(mydata, content_transformer(removeURL))

# Replace non-word characters with spaces. Use the full argument name
# 'replacement' (the original relied on partial matching via 'replace=').
mydata <- tm_map(mydata, content_transformer(gsub),
                 pattern = "\\W", replacement = " ")

# Drop every character that is neither a letter nor whitespace.
# This also covers digits and any remaining punctuation, so separate
# removeNumbers/removePunctuation passes are unnecessary.
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
mydata <- tm_map(mydata, content_transformer(removeNumPunct))

# Remove Italian stopwords plus corpus-specific terms in a single pass
# (the original ran stopwords("italian") twice).
myStopwords <- c(setdiff(stopwords("italian"), c("r")), "coronavirus", "covid")
mydata <- tm_map(mydata, removeWords, myStopwords)

# Collapse the runs of whitespace left behind by the removals.
mydata <- tm_map(mydata, stripWhitespace)
# Persist each cleaned document to its own file in the working directory.
April <- writeCorpus(mydata)

# Aggregate term frequencies: term-document matrix -> per-term totals,
# sorted most-frequent first.
tdm <- TermDocumentMatrix(mydata)
term_counts <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
freq_table <- data.frame(word = names(term_counts), freq = term_counts)
head(freq_table)
## word freq
## radiosavana radiosavana 3626
## italia italia 2158
## conte conte 1506
## oggi oggi 1359
## tutta tutta 1335
## commercianti commercianti 998

## Visualization ----
set.seed(1234)  # fixed seed -> reproducible word-cloud layout
wordcloud(
  words = freq_table$word, freq = freq_table$freq,
  min.freq = 1, max.words = 50,
  random.order = FALSE, rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)

# Bar chart of the ten most frequent terms.
top_terms <- freq_table[1:10, ]
barplot(
  top_terms$freq, names.arg = top_terms$word, las = 2,
  col = heat.colors(10), main = "Most frequent words",
  ylab = "Word frequencies", cex.names = 0.8
)
## Hashtags
library(quanteda)
## Package version: 2.0.1
## Parallel computing: 2 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following objects are masked from 'package:NLP':
##
## meta, meta<-
## The following object is masked from 'package:utils':
##
## View
library(readtext)

# as.character() silently ignores an 'encoding' argument (it falls into
# '...'); enc2utf8() actually converts/marks the strings as UTF-8.
covid04$text <- enc2utf8(as.character(covid04$text))
mycorp <- corpus(covid04$text)

# Two document-feature matrices: punctuation stripped for hashtag tokens,
# untouched for @user mentions. Renamed the second one — binding it to
# 'dfm' masks quanteda::dfm() itself.
tweet_dfm <- dfm(mycorp, remove_punct = TRUE)
full_dfm <- dfm(mycorp)

# Co-occurrence network of the 50 most frequent hashtags.
tag_dfm <- dfm_select(tweet_dfm, pattern = "#*")
tag_fcm <- fcm(tag_dfm)
toptag <- names(topfeatures(tag_dfm, 50))
topgat_fcm <- fcm_select(tag_fcm, pattern = toptag)
textplot_network(topgat_fcm, min_freq = 0.1, edge_alpha = 0.8, edge_size = 5)

# Users ----
# Co-occurrence network of the 50 most mentioned accounts.
user_dfm <- dfm_select(full_dfm, pattern = "@*")
topuser <- names(topfeatures(user_dfm, 50))
head(topuser)
## [1] "@radiosavana" "@cartabellotta" "@rtl1025" "@quirinale"
## [5] "@antoniosocci1" "@vaevictis"
user_fcm <- fcm(user_dfm)
user_fcm <- fcm_select(user_fcm, pattern = topuser)
textplot_network(user_fcm, min_freq = 0.1, edge_color = "orange",
                 edge_alpha = 0.8, edge_size = 5)