Fallos en la clasificación

library(readr)
# First part of the tweet dump, read with base read.csv (the remaining parts
# are read with readr::read_csv). `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
IRAhandle_tweets_1 <- read.csv("IRAhandle_tweets_1.csv", header = TRUE)

# readr's type guessing picked logical for tco3_step1, so the URLs stored in
# that column failed to parse (see the parsing failures recorded below, which
# come from the run without col_types). Pin it to character; post_type is
# pinned as well so every part is read with the same types before stacking.
IRAhandle_tweets_2 <- read_csv(
  "IRAhandle_tweets_2.csv",
  col_types = cols(
    post_type = col_character(),
    tco3_step1 = col_character()
  )
)

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   external_author_id = col_double(),
##   following = col_double(),
##   followers = col_double(),
##   updates = col_double(),
##   retweet = col_double(),
##   new_june_2018 = col_double(),
##   alt_external_id = col_double(),
##   tweet_id = col_double(),
##   tco3_step1 = col_logical()
## )

## See spec(...) for full column specifications.

## Warning: 2042 parsing failures.
##   row        col           expected                                                                     actual                     file
##  3345 tco3_step1 1/0/T/F/TRUE/FALSE http://rus.myprintbar.ru                                                   'IRAhandle_tweets_2.csv'
##  4375 tco3_step1 1/0/T/F/TRUE/FALSE http://gamiliel.com/2012/06/05/john-8-the-adulterous-woman/                'IRAhandle_tweets_2.csv'
##  5187 tco3_step1 1/0/T/F/TRUE/FALSE http://www.meadowvalecrc.org/?powerpress_pinw=3162-podcast                 'IRAhandle_tweets_2.csv'
##  6714 tco3_step1 1/0/T/F/TRUE/FALSE http://www.meadowvalecrc.org/?powerpress_pinw=3321-podcast                 'IRAhandle_tweets_2.csv'
## 15483 tco3_step1 1/0/T/F/TRUE/FALSE http://www.wbaltv.com/national/uk-votes-to-leave-in-eu-referendum/40202614 'IRAhandle_tweets_2.csv'
## ..... .......... .................. .......................................................................... ........................
## See problems(...) for more details.

# No parsing failures were recorded for this part (both columns were already
# read as character); they are pinned anyway so every part is read with the
# same types for post_type and tco3_step1 before stacking.
IRAhandle_tweets_3 <- read_csv(
  "IRAhandle_tweets_3.csv",
  col_types = cols(
    post_type = col_character(),
    tco3_step1 = col_character()
  )
)

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   external_author_id = col_double(),
##   following = col_double(),
##   followers = col_double(),
##   updates = col_double(),
##   retweet = col_double(),
##   new_june_2018 = col_double(),
##   alt_external_id = col_double(),
##   tweet_id = col_double()
## )
## See spec(...) for full column specifications.

# readr's type guessing picked logical for post_type and tco3_step1, so values
# such as "RETWEET" and the URLs failed to parse (see the parsing failures
# recorded below, which come from the run without col_types). Pin both
# columns to character.
IRAhandle_tweets_4 <- read_csv(
  "IRAhandle_tweets_4.csv",
  col_types = cols(
    post_type = col_character(),
    tco3_step1 = col_character()
  )
)

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   external_author_id = col_double(),
##   following = col_double(),
##   followers = col_double(),
##   updates = col_double(),
##   post_type = col_logical(),
##   retweet = col_double(),
##   new_june_2018 = col_double(),
##   alt_external_id = col_double(),
##   tweet_id = col_double(),
##   tco3_step1 = col_logical()
## )
## See spec(...) for full column specifications.

## Warning: 99326 parsing failures.
##   row        col           expected                actual                     file
##  4038 tco3_step1 1/0/T/F/TRUE/FALSE http://Dictionary.com 'IRAhandle_tweets_4.csv'
##  4239 tco3_step1 1/0/T/F/TRUE/FALSE http://Backpage.com   'IRAhandle_tweets_4.csv'
##  5554 tco3_step1 1/0/T/F/TRUE/FALSE http://ABC7NEWS.COM   'IRAhandle_tweets_4.csv'
##  9972 post_type  1/0/T/F/TRUE/FALSE RETWEET               'IRAhandle_tweets_4.csv'
## 10183 post_type  1/0/T/F/TRUE/FALSE RETWEET               'IRAhandle_tweets_4.csv'
## ..... .......... .................. ..................... ........................
## See problems(...) for more details.

# readr's type guessing picked logical for tco3_step1, so the URLs stored in
# that column failed to parse (see the parsing failures recorded below, which
# come from the run without col_types). Pin it to character; post_type is
# pinned as well so every part is read with the same types before stacking.
IRAhandle_tweets_5 <- read_csv(
  "IRAhandle_tweets_5.csv",
  col_types = cols(
    post_type = col_character(),
    tco3_step1 = col_character()
  )
)

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   external_author_id = col_double(),
##   following = col_double(),
##   followers = col_double(),
##   updates = col_double(),
##   retweet = col_double(),
##   new_june_2018 = col_double(),
##   alt_external_id = col_double(),
##   tweet_id = col_double(),
##   tco3_step1 = col_logical()
## )
## See spec(...) for full column specifications.

## Warning: 305 parsing failures.
##  row        col           expected                          actual                     file
## 1084 tco3_step1 1/0/T/F/TRUE/FALSE http://vice.com                 'IRAhandle_tweets_5.csv'
## 3026 tco3_step1 1/0/T/F/TRUE/FALSE http://dlvr.it/Nk4PFy           'IRAhandle_tweets_5.csv'
## 3426 tco3_step1 1/0/T/F/TRUE/FALSE http://freecomicsonlinewoke.com 'IRAhandle_tweets_5.csv'
## 3828 tco3_step1 1/0/T/F/TRUE/FALSE https://youtu.be/1vWvl3sMPlg    'IRAhandle_tweets_5.csv'
## 4745 tco3_step1 1/0/T/F/TRUE/FALSE http://Refusefascism.org        'IRAhandle_tweets_5.csv'
## .... .......... .................. ............................... ........................
## See problems(...) for more details.

# readr's type guessing picked logical for tco3_step1, so the URLs stored in
# that column failed to parse (see the parsing failures recorded below, which
# come from the run without col_types). Pin it to character; post_type is
# pinned as well so every part is read with the same types before stacking.
IRAhandle_tweets_6 <- read_csv(
  "IRAhandle_tweets_6.csv",
  col_types = cols(
    post_type = col_character(),
    tco3_step1 = col_character()
  )
)

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   external_author_id = col_double(),
##   following = col_double(),
##   followers = col_double(),
##   updates = col_double(),
##   retweet = col_double(),
##   new_june_2018 = col_double(),
##   alt_external_id = col_double(),
##   tweet_id = col_double(),
##   tco3_step1 = col_logical()
## )
## See spec(...) for full column specifications.

## Warning: 1708 parsing failures.
##  row        col           expected                                                     actual                     file
## 1605 tco3_step1 1/0/T/F/TRUE/FALSE http://www.meadowvalecrc.org/?powerpress_pinw=3381-podcast 'IRAhandle_tweets_6.csv'
## 2500 tco3_step1 1/0/T/F/TRUE/FALSE http://gamiliel.com/2012/06/06/john-13-the-passover/       'IRAhandle_tweets_6.csv'
## 2819 tco3_step1 1/0/T/F/TRUE/FALSE http://gamiliel.com/2013/01/07/romans-7/                   'IRAhandle_tweets_6.csv'
## 2825 tco3_step1 1/0/T/F/TRUE/FALSE https://500px.com/gamilliell                               'IRAhandle_tweets_6.csv'
## 4061 tco3_step1 1/0/T/F/TRUE/FALSE http://www.meadowvalecrc.org/?powerpress_pinw=3303-podcast 'IRAhandle_tweets_6.csv'
## .... .......... .................. .......................................................... ........................
## See problems(...) for more details.

# readr's type guessing picked logical for tco3_step1, so the URLs stored in
# that column failed to parse (see the parsing failures recorded below, which
# come from the run without col_types). Pin it to character; post_type is
# pinned as well so every part is read with the same types before stacking.
IRAhandle_tweets_7 <- read_csv(
  "IRAhandle_tweets_7.csv",
  col_types = cols(
    post_type = col_character(),
    tco3_step1 = col_character()
  )
)

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   external_author_id = col_double(),
##   following = col_double(),
##   followers = col_double(),
##   updates = col_double(),
##   retweet = col_double(),
##   new_june_2018 = col_double(),
##   alt_external_id = col_double(),
##   tweet_id = col_double(),
##   tco3_step1 = col_logical()
## )
## See spec(...) for full column specifications.

## Warning: 383 parsing failures.
##  row        col           expected                                                                       actual                     file
## 2893 tco3_step1 1/0/T/F/TRUE/FALSE http://bit.ly/2ismFW6                                                        'IRAhandle_tweets_7.csv'
## 3022 tco3_step1 1/0/T/F/TRUE/FALSE http://kNOwBETTERHIPHOP.com                                                  'IRAhandle_tweets_7.csv'
## 3056 tco3_step1 1/0/T/F/TRUE/FALSE http://RightOnTVMobileApp.com                                                'IRAhandle_tweets_7.csv'
## 4200 tco3_step1 1/0/T/F/TRUE/FALSE http://kweliclub.com/products/walter-rodney-how-europe-underdeveloped-africa 'IRAhandle_tweets_7.csv'
## 5294 tco3_step1 1/0/T/F/TRUE/FALSE http://www.loonwatch.com/tag/sam-harris/                                     'IRAhandle_tweets_7.csv'
## .... .......... .................. ............................................................................ ........................
## See problems(...) for more details.

# readr's type guessing picked logical for tco3_step1, so the URLs stored in
# that column failed to parse (see the parsing failures recorded below, which
# come from the run without col_types). Pin it to character; post_type is
# pinned as well so every part is read with the same types before stacking.
IRAhandle_tweets_8 <- read_csv(
  "IRAhandle_tweets_8.csv",
  col_types = cols(
    post_type = col_character(),
    tco3_step1 = col_character()
  )
)

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   external_author_id = col_double(),
##   following = col_double(),
##   followers = col_double(),
##   updates = col_double(),
##   retweet = col_double(),
##   new_june_2018 = col_double(),
##   alt_external_id = col_double(),
##   tweet_id = col_double(),
##   tco3_step1 = col_logical()
## )
## See spec(...) for full column specifications.

## Warning: 285 parsing failures.
##   row        col           expected                                                                                   actual                     file
##  4681 tco3_step1 1/0/T/F/TRUE/FALSE http://rus.myprintbar.ru                                                                 'IRAhandle_tweets_8.csv'
##  5289 tco3_step1 1/0/T/F/TRUE/FALSE https://www.smashwords.com/books/view/474508                                             'IRAhandle_tweets_8.csv'
##  5564 tco3_step1 1/0/T/F/TRUE/FALSE http://patriotsunite.info                                                                'IRAhandle_tweets_8.csv'
## 13745 tco3_step1 1/0/T/F/TRUE/FALSE http://www.rosbalt.ru/piter/2015/06/18/1409874.html                                      'IRAhandle_tweets_8.csv'
## 17910 tco3_step1 1/0/T/F/TRUE/FALSE https://nevnov.ru/457531-aleksandr-perendzhiev-ozhidaem-popytku-gosperevorota-na-ukraine 'IRAhandle_tweets_8.csv'
## ..... .......... .................. ........................................................................................ ........................
## See problems(...) for more details.

# readr's type guessing picked logical for post_type and tco3_step1, so values
# such as "RETWEET" failed to parse (see the parsing failures recorded below,
# which come from the run without col_types). Pin both columns to character.
IRAhandle_tweets_9 <- read_csv(
  "IRAhandle_tweets_9.csv",
  col_types = cols(
    post_type = col_character(),
    tco3_step1 = col_character()
  )
)

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   external_author_id = col_double(),
##   following = col_double(),
##   followers = col_double(),
##   updates = col_double(),
##   post_type = col_logical(),
##   retweet = col_double(),
##   new_june_2018 = col_double(),
##   alt_external_id = col_double(),
##   tweet_id = col_double(),
##   tco3_step1 = col_logical()
## )
## See spec(...) for full column specifications.

## Warning: 67645 parsing failures.
##  row       col           expected  actual                     file
## 1036 post_type 1/0/T/F/TRUE/FALSE RETWEET 'IRAhandle_tweets_9.csv'
## 1037 post_type 1/0/T/F/TRUE/FALSE RETWEET 'IRAhandle_tweets_9.csv'
## 1061 post_type 1/0/T/F/TRUE/FALSE RETWEET 'IRAhandle_tweets_9.csv'
## 1064 post_type 1/0/T/F/TRUE/FALSE RETWEET 'IRAhandle_tweets_9.csv'
## 1069 post_type 1/0/T/F/TRUE/FALSE RETWEET 'IRAhandle_tweets_9.csv'
## .... ......... .................. ....... ........................
## See problems(...) for more details.

# readr's type guessing picked logical for post_type and tco3_step1, so values
# such as "RETWEET" and the URLs failed to parse (see the parsing failures
# recorded below, which come from the run without col_types). Pin both
# columns to character.
IRAhandle_tweets_10 <- read_csv(
  "IRAhandle_tweets_10.csv",
  col_types = cols(
    post_type = col_character(),
    tco3_step1 = col_character()
  )
)

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   external_author_id = col_double(),
##   following = col_double(),
##   followers = col_double(),
##   updates = col_double(),
##   post_type = col_logical(),
##   retweet = col_double(),
##   new_june_2018 = col_double(),
##   alt_external_id = col_double(),
##   tweet_id = col_double(),
##   tco3_step1 = col_logical()
## )
## See spec(...) for full column specifications.

## Warning: 89701 parsing failures.
##  row        col           expected                                            actual                      file
## 1100 post_type  1/0/T/F/TRUE/FALSE RETWEET                                           'IRAhandle_tweets_10.csv'
## 1344 post_type  1/0/T/F/TRUE/FALSE RETWEET                                           'IRAhandle_tweets_10.csv'
## 2019 tco3_step1 1/0/T/F/TRUE/FALSE https://www.ridus.ru/news/248567                  'IRAhandle_tweets_10.csv'
## 2163 tco3_step1 1/0/T/F/TRUE/FALSE https://www.ridus.ru/news/248770                  'IRAhandle_tweets_10.csv'
## 2165 tco3_step1 1/0/T/F/TRUE/FALSE http://vietnam.mid.ru/elektronnaa-viza-vo-v-etnam 'IRAhandle_tweets_10.csv'
## .... .......... .................. ................................................. .........................
## See problems(...) for more details.

# readr's type guessing picked logical for post_type and tco3_step1, so values
# such as "RETWEET" failed to parse (see the parsing failures recorded below,
# which come from the run without col_types). Pin both columns to character.
IRAhandle_tweets_11 <- read_csv(
  "IRAhandle_tweets_11.csv",
  col_types = cols(
    post_type = col_character(),
    tco3_step1 = col_character()
  )
)

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   external_author_id = col_double(),
##   following = col_double(),
##   followers = col_double(),
##   updates = col_double(),
##   post_type = col_logical(),
##   retweet = col_double(),
##   new_june_2018 = col_double(),
##   alt_external_id = col_double(),
##   tweet_id = col_double(),
##   tco3_step1 = col_logical()
## )
## See spec(...) for full column specifications.

## Warning: 66039 parsing failures.
##  row       col           expected  actual                      file
## 5247 post_type 1/0/T/F/TRUE/FALSE RETWEET 'IRAhandle_tweets_11.csv'
## 5274 post_type 1/0/T/F/TRUE/FALSE RETWEET 'IRAhandle_tweets_11.csv'
## 5606 post_type 1/0/T/F/TRUE/FALSE RETWEET 'IRAhandle_tweets_11.csv'
## 5701 post_type 1/0/T/F/TRUE/FALSE RETWEET 'IRAhandle_tweets_11.csv'
## 6011 post_type 1/0/T/F/TRUE/FALSE RETWEET 'IRAhandle_tweets_11.csv'
## .... ......... .................. ....... .........................
## See problems(...) for more details.

# readr's type guessing picked logical for post_type and tco3_step1, so values
# such as "RETWEET" failed to parse (see the parsing failures recorded below,
# which come from the run without col_types). Pin both columns to character.
IRAhandle_tweets_12 <- read_csv(
  "IRAhandle_tweets_12.csv",
  col_types = cols(
    post_type = col_character(),
    tco3_step1 = col_character()
  )
)

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   external_author_id = col_double(),
##   following = col_double(),
##   followers = col_double(),
##   updates = col_double(),
##   post_type = col_logical(),
##   retweet = col_double(),
##   new_june_2018 = col_double(),
##   alt_external_id = col_double(),
##   tweet_id = col_double(),
##   tco3_step1 = col_logical()
## )
## See spec(...) for full column specifications.

## Warning: 110978 parsing failures.
##   row       col           expected  actual                      file
## 14784 post_type 1/0/T/F/TRUE/FALSE RETWEET 'IRAhandle_tweets_12.csv'
## 26336 post_type 1/0/T/F/TRUE/FALSE RETWEET 'IRAhandle_tweets_12.csv'
## 27167 post_type 1/0/T/F/TRUE/FALSE RETWEET 'IRAhandle_tweets_12.csv'
## 27168 post_type 1/0/T/F/TRUE/FALSE RETWEET 'IRAhandle_tweets_12.csv'
## 27169 post_type 1/0/T/F/TRUE/FALSE RETWEET 'IRAhandle_tweets_12.csv'
## ..... ......... .................. ....... .........................
## See problems(...) for more details.

# No parsing failures were recorded for this part (both columns were already
# read as character); they are pinned anyway so every part is read with the
# same types for post_type and tco3_step1 before stacking.
IRAhandle_tweets_13 <- read_csv(
  "IRAhandle_tweets_13.csv",
  col_types = cols(
    post_type = col_character(),
    tco3_step1 = col_character()
  )
)

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   external_author_id = col_double(),
##   following = col_double(),
##   followers = col_double(),
##   updates = col_double(),
##   retweet = col_double(),
##   new_june_2018 = col_double(),
##   alt_external_id = col_double(),
##   tweet_id = col_double()
## )
## See spec(...) for full column specifications.

# Load the true labels and the predicted labels of the evaluated accounts.
# y_test.csv is read without a header row, y_pred.csv with one.
# NOTE(review): the two files are read with different header settings --
# confirm that y_test.csv really has no header line, otherwise its rows are
# shifted by one with respect to y_pred.
y_test <- read.csv(file = "y_test.csv", header = FALSE)
y_pred <- read.csv(file = "y_pred.csv")

# Give the columns used later explicit names.
names(y_test)[1:2] <- c("author", "X0_test")
names(y_pred)[2] <- "X0_pred"

# Split the evaluated accounts into misclassified and correctly classified
# ones by comparing, row by row, the true label (column 2 of y_test) with the
# predicted label (column 2 of y_pred). Author names are taken from column 1
# of y_pred.
# NOTE(review): this assumes row i of y_test and row i of y_pred describe the
# same account; nothing here verifies it.
total_y <- list(y_test, y_pred)

# Compare every available row instead of a hard-coded 1:557, so the script
# keeps working when the evaluation set changes size.
n_rows <- min(nrow(y_test), nrow(y_pred))
if (nrow(y_test) != nrow(y_pred)) {
  warning(
    "y_test and y_pred differ in row count; comparing the first ",
    n_rows, " rows only",
    call. = FALSE
  )
}
rows <- seq_len(n_rows)

# Vectorised comparison; replaces growing two data frames cell by cell.
is_fail <- y_test[rows, 2] != y_pred[rows, 2]
# A missing label used to stop the loop with an error in `if`; keep failing
# loudly rather than silently dropping those rows.
stopifnot(!anyNA(is_fail))

pred_authors <- y_pred[rows, 1]

# One row per distinct author. Building the column by name also covers the
# case where one of the groups is empty.
fail_accounts <- na.omit(data.frame(author = unique(pred_authors[is_fail])))
success_accounts <- na.omit(data.frame(author = unique(pred_authors[!is_fail])))

summary(fail_accounts)

##              author  
##  _RUBY_WILLS_   : 1  
##  _YOUR_LIFESTYLE: 1  
##  AFONINMIXAIL   : 1  
##  ALVA_MC_GHEE   : 1  
##  AMBBERTHTT     : 1  
##  ANCARICTRS     : 1  
##  (Other)        :79

summary(success_accounts)

##            author   
##  _RONBEN      :  1  
##  _SASHALAPIN  :  1  
##  4EVER1937    :  1  
##  AARONALLENALL:  1  
##  ABBYLOPTRT   :  1  
##  ABISADMASST  :  1  
##  (Other)      :466

# One-column data frames (`author`) holding the misclassified and the
# correctly classified author names as text, sorted in increasing order.
# They are the lookup tables joined against the tweets further down.
fail_authors <- data.frame(
  author = sort(as.character(fail_accounts$author))
)

success_authors <- data.frame(
  author = sort(as.character(success_accounts$author))
)

Se unen todas las partes del dataset original en un solo data frame.

# Stack the 13 partial tables into a single data frame, in file order.
total_tweets <- do.call(
  rbind,
  list(
    IRAhandle_tweets_1, IRAhandle_tweets_2, IRAhandle_tweets_3,
    IRAhandle_tweets_4, IRAhandle_tweets_5, IRAhandle_tweets_6,
    IRAhandle_tweets_7, IRAhandle_tweets_8, IRAhandle_tweets_9,
    IRAhandle_tweets_10, IRAhandle_tweets_11, IRAhandle_tweets_12,
    IRAhandle_tweets_13
  )
)

# Keep only the five columns used by the analysis. data.frame() names the
# columns after the expressions passed to it (total_tweets.content,
# total_tweets.following, ...); later code relies on those names, so the
# `$` form is kept and only the first column is renamed.
total_tweets <- data.frame(
  total_tweets$author,
  total_tweets$content,
  total_tweets$following,
  total_tweets$followers,
  total_tweets$updates
)

colnames(total_tweets)[1] <- "author"

Se hace un cruce (join) entre el dataset original y el data frame que contiene los autores cuya clase fue mal predicha, para obtener un data frame con los tweets y atributos de dichos autores.

library(tidyverse)

## -- Attaching packages ------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.1.1       v purrr   0.3.1  
## v tibble  2.0.1       v dplyr   0.8.0.1
## v tidyr   0.8.3       v stringr 1.4.0  
## v ggplot2 3.1.1       v forcats 0.4.0

## -- Conflicts ---------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Tweets (with their account attributes) written by the misclassified
# authors: every row of fail_authors is kept and matched to its tweets by
# author name.
# The stray positional argument `total_failed_tweets` was removed: it named
# the very object being created and, with `by` given by name, was bound to
# right_join()'s `copy` parameter.
total_failed_tweets <- total_tweets %>%
  right_join(fail_authors, by = "author")

## Warning: Column `author` joining factors with different levels, coercing to
## character vector

summary(total_failed_tweets)

##     author         
##  Length:155980     
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##                    
##                                                    total_tweets.content
##  GUIDE: What's up in Pittsburgh this weekend?  #entertainment:    27   
##  Thursday on the Fan  #news                                  :    15   
##  Wednesday on the Fan  #news                                 :    14   
##  Friday on the Fan  #news                                    :    13   
##  GUIDE: What's up in Pittsburgh this weekend?  #local        :    13   
##  What's up in Pittsburgh this weekend?  #entertainment       :    12   
##  (Other)                                                     :155886   
##  total_tweets.following total_tweets.followers total_tweets.updates
##  Min.   :   -1          Min.   :    -1         Min.   :   -1       
##  1st Qu.:  664          1st Qu.:  1050         1st Qu.: 2466       
##  Median : 3210          Median :  5845         Median : 8770       
##  Mean   : 4297          Mean   : 10699         Mean   :12078       
##  3rd Qu.: 6355          3rd Qu.: 15087         3rd Qu.:18240       
##  Max.   :33666          Max.   :103197         Max.   :56470       
##

Ahora lo mismo pero con los autores que fueron bien clasificados:

library(tidyverse)
# Tweets (with their account attributes) written by the correctly classified
# authors: every row of success_authors is kept and matched to its tweets by
# author name.
# The stray positional argument `total_successful_tweets` was removed: it
# named the very object being created and, with `by` given by name, was bound
# to right_join()'s `copy` parameter.
total_successful_tweets <- total_tweets %>%
  right_join(success_authors, by = "author")

## Warning: Column `author` joining factors with different levels, coercing to
## character vector

summary(total_successful_tweets)

##     author         
##  Length:698202     
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##                    
##                                                                                                                                                                                                                                                                                                                                                                                          total_tweets.content
##  <U+0412> <U+0433><U+043E><U+0440><U+043E><U+0434><U+0435> <U+0421><U+043E><U+0447><U+0438>. <U+041E><U+043B><U+0438><U+043C><U+043F><U+0438><U+0430><U+0434><U+0430> – <U+043F><U+0440><U+0430><U+0437><U+0434><U+043D><U+0438><U+043A> <U+0438><U+043B><U+0438> <U+0441><U+0442><U+0438><U+0445><U+0438><U+0439><U+043D><U+043E><U+0435>...                                                      :   137   
##  Celebrity style: Red carpet looks  #celebs #news                                                                                                                                                                                                                                                                                                                                                  :    79   
##  Daily Celebrity Watch  #celebs #news                                                                                                                                                                                                                                                                                                                                                              :    72   
##  ...<U+0441><U+0442><U+0430><U+0434><U+0438><U+043E><U+043D>, <U+0423> <U+043D><U+0430><U+0441> <U+0441><U+0432><U+043E><U+044F> <U+043E><U+043B><U+0438><U+043C><U+043F><U+0438><U+0430><U+0434><U+0430> – <U+0417><U+0430> <U+043C><U+0430><U+043B><U+044B><U+0448><U+043E><U+043C> <U+0431><U+0440><U+043E><U+0441><U+043E><U+043A> <U+043F><U+043E><U+0434> <U+0441><U+0442><U+043E><U+043B>...:    53   
##  NewsOne Now Audio Podcast: Bishop E.W. Jackson Calls #BlackLivesMatter Is Movement “Disgraceful”                                                                                                                                                                                                                                                                                                  :    50   
##  <U+041B><U+043E><U+043D><U+0434><U+043E><U+043D> 2012 — <U+041E><U+043B><U+0438><U+043C><U+043F><U+0438><U+0430><U+0434><U+0430> <U+0410><U+043D><U+0442><U+0438><U+0445><U+0440><U+0438><U+0441><U+0442><U+0430>                                                                                                                                                                                 :    39   
##  (Other)                                                                                                                                                                                                                                                                                                                                                                                           :697772   
##  total_tweets.following total_tweets.followers total_tweets.updates
##  Min.   :    0          Min.   :    0          Min.   :    1       
##  1st Qu.:  429          1st Qu.:  291          1st Qu.: 1631       
##  Median : 1838          Median : 1521          Median : 4274       
##  Mean   : 3625          Mean   : 5078          Mean   : 8170       
##  3rd Qu.: 4848          3rd Qu.: 4369          3rd Qu.:11203       
##  Max.   :26371          Max.   :71022          Max.   :46749       
##

Wordcloud para los autores mal clasificados:

library(tm)

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

library(SnowballC)
library(RColorBrewer)
library(wordcloud)

# Word cloud of the tweets written by misclassified authors.

# Build the corpus from the tweet text. as.character() makes the input
# explicit text: the summary of total_failed_tweets above shows the content
# column as a factor.
total_failed_tweets.Corpus <- Corpus(
  VectorSource(as.character(total_failed_tweets$total_tweets.content))
)

# Cleaning steps, applied in order: lower-case, drop digits, drop English
# stop words, drop punctuation, collapse whitespace, stem.
# Changes with respect to the previous version:
#  * the tm_map(..., PlainTextDocument) step was removed -- its result was
#    overwritten by the next statement, which restarted from the raw corpus,
#    so it never took part in the cleaning;
#  * tolower is wrapped in content_transformer(), the form tm documents for
#    functions that are not tm transformations.
# In the recorded run every tm_map() call on this corpus emitted the warning
# "transformation drops documents".
# NOTE(review): only English stop words are removed; function words of any
# other language present in the tweets stay in the cloud.
total_failed_tweets.Clean <- total_failed_tweets.Corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(stemDocument)

# Show at most 50 terms, most frequent first.
wordcloud(
  total_failed_tweets.Clean,
  min.freq = 1,
  max.words = 50,
  random.order = FALSE,
  rot.per = 0.25,
  colors = brewer.pal(8, "Dark2")
)

Wordcloud para los bien clasificados

library(tm)
library(SnowballC)
library(RColorBrewer)
library(wordcloud)

# Word cloud of the tweets written by correctly classified authors.

# Build the corpus from the tweet text. as.character() makes the input
# explicit text: the summary of total_successful_tweets above shows the
# content column as a factor.
total_successful_tweets.Corpus <- Corpus(
  VectorSource(as.character(total_successful_tweets$total_tweets.content))
)

# Cleaning steps, applied in order: lower-case, drop digits, drop English
# stop words, drop punctuation, collapse whitespace, stem.
# Changes with respect to the previous version:
#  * the tm_map(..., PlainTextDocument) step was removed -- its result was
#    overwritten by the next statement, which restarted from the raw corpus,
#    so it never took part in the cleaning;
#  * tolower is wrapped in content_transformer(), the form tm documents for
#    functions that are not tm transformations.
# In the recorded run every tm_map() call on this corpus emitted the warning
# "transformation drops documents".
# NOTE(review): only English stop words are removed, although the summary of
# total_successful_tweets above shows Russian text among the most frequent
# tweets; Russian function words therefore stay in the cloud.
total_successful_tweets.Clean <- total_successful_tweets.Corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace) %>%
  tm_map(stemDocument)

# Show at most 50 terms, most frequent first.
wordcloud(
  total_successful_tweets.Clean,
  min.freq = 1,
  max.words = 50,
  random.order = FALSE,
  rot.per = 0.4,
  colors = brewer.pal(8, "Dark2")
)

Se puede observar que los tweets de los autores mal clasificados suelen contener palabras rusas con mayor frecuencia.

Fallos en la clasificación

Camila Villar Mascaró, Miguel Sepúlveda Huenchuleo, Felipe Canales

25 de junio de 2019