Install Packages

install.packages("bindrcpp")
install.packages("gridExtra")
install.packages("reshape2")
install.packages("topicmodels")
install.packages("maps")
install.packages("ggraph")
install.packages("igraph")
install.packages("tm")
install.packages("NLP")
install.packages("wordcloud")
install.packages("RColorBrewer")
install.packages("SnowballC")
install.packages("tidytext")
install.packages("tidyquant")
install.packages("quantmod")
install.packages("TTR")
install.packages("PerformanceAnalytics")
install.packages("xts")
install.packages("zoo")
install.packages("lubridate")
install.packages("forcats")
install.packages("stringr")
install.packages("dplyr")
install.packages("purrr")
install.packages("readr")
install.packages("tidyr")
install.packages("tibble")
install.packages("ggplot2")
install.packages("tidyverse")
install.packages("rtweet")

Load Packages

library(rtweet)
library(tidyverse)
library(tidyquant)
library(ggplot2)
library(tidytext)
library(SnowballC)
library(wordcloud)
library(tm)
library(igraph)
library(ggraph)
library(maps)
library(topicmodels)
library(reshape2)
library(grid)
library(gridExtra)

Introduction to rtweet

create a token with the web browser method: create token and save it as an environment variable

Create an Access Token Tutorial http://rtweet.info/articles/auth.html

# Read the Twitter app credentials from environment variables (e.g. set in
# ~/.Renviron) so that secrets are never stored in the notebook itself.
twitter_consumer_key <- Sys.getenv("TWITTER_CONSUMER_KEY")
twitter_consumer_secret <- Sys.getenv("TWITTER_CONSUMER_SECRET")
if (!nzchar(twitter_consumer_key) || !nzchar(twitter_consumer_secret)) {
  stop(
    "Set TWITTER_CONSUMER_KEY and TWITTER_CONSUMER_SECRET before running.",
    call. = FALSE
  )
}
create_token(
  app = "sociolegaltech_R_analysis",
  consumer_key = twitter_consumer_key,
  consumer_secret = twitter_consumer_secret
)
Waiting for authentication in browser...
Press Esc/Ctrl + C to abort
Authentication complete.
<Token>
<oauth_endpoint>
 request:   https://api.twitter.com/oauth/request_token
 authorize: https://api.twitter.com/oauth/authenticate
 access:    https://api.twitter.com/oauth/access_token
<oauth_app> sociolegaltech_R_analysis
  key:    <redacted>
  secret: <hidden>
<credentials> oauth_token, oauth_token_secret, user_id, screen_name
---

Search for tweets using a hashtag. name-of-dataframe <- search_tweets( "searchterm", n = numberoftweets, include_rts = falseortrue)

searched_tweets <- search_tweets(
  "#bitcoin", n = 18000, include_rts = FALSE
)

You will now have a data frame with tweets that have been scraped from twitter.

You can look at the dataframe by entering view(searched_tweets)

If you do not have any observations, something did not work correctly.

You can interact with the data frame, such as plotting a time series.

ts_plot(nameofdataframe, "time_interval") +
other options to make the plot

plot time series of tweets

# Plot tweet frequency in three-hour bins. The title names #bitcoin because
# searched_tweets was collected with the "#bitcoin" query.
ts_plot(searched_tweets, "3 hours") +
  ggplot2::theme_minimal() +
  ggplot2::theme(plot.title = ggplot2::element_text(face = "bold")) +
  ggplot2::labs(
    x = NULL, y = NULL,
    title = "Frequency of #bitcoin Twitter statuses from past 9 days",
    subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
    caption = "\nSource: Data collected from Twitter's REST API via rtweet"
  )

Search for 10,000 English-language tweets sent from around Irvine, CA

You can use the search_tweets function to find tweets within a specified geographic area, using a Google API key to look up the coordinates.

# Read the Google geocoding key from the environment instead of embedding it.
google_api_key <- Sys.getenv("GOOGLE_MAPS_API_KEY")
if (!nzchar(google_api_key)) {
  stop("Set GOOGLE_MAPS_API_KEY before running.", call. = FALSE)
}
# Look up the coordinates for Irvine, CA, then search English-language tweets
# geocoded to that area.
irvine_tweets <- search_tweets(
  "lang:en",
  geocode = lookup_coords("irvine, CA", "country:US", apikey = google_api_key),
  n = 10000
)
# Install jsonlite only when it is missing: re-installing a package that is
# already loaded fails with "Error in install.packages : Updating loaded packages".
if (!requireNamespace("jsonlite", quietly = TRUE)) {
  install.packages("jsonlite")
}
library(jsonlite)
# Load a previously saved set of #bitcoin tweets from the course repository.
bitcoin_tweets2 <- fromJSON("https://github.com/sociolegaltech/sociolegaltech.github.io/raw/master/bitcoin_tweets.json")

create lat/lng variables using all available tweet and profile geo-location data

irvine_tweets_latlon <- lat_lng(irvine_tweets)

Plot the Orange County, California boundary and overlay the tweet locations

par(mar = c(0, 0, 0, 0))
maps::map("county", regions="california,orange", lwd = .25)
with(irvine_tweets_latlon, points(lng, lat, pch = 20, cex = .75, col = rgb(0, .3, .7, .75)))

Get Tweets of A User

# Fetch the account's timeline including retweets. The API parameter is spelled
# include_rts (as in search_tweets above); includeRts is the twitteR spelling.
timeline_tweets <- get_timeline("sociolegaltech", include_rts = TRUE)

Get Mentions of A User

# get_mentions() returns mentions of the authenticated account; its first
# argument is the number of tweets (n), so a screen name must not be passed.
mentions_tweets <- get_mentions()

Get friends

friends <- get_friends("sociolegaltech")

Get Information on Friends

friends <- lookup_users(friends$user_id)

What languages do my friends speak?

friends %>%
  count(lang) %>%
  droplevels() %>%
  ggplot(aes(x = reorder(lang, desc(n)), y = n)) +
    geom_bar(stat = "identity", color = palette_light()[1], fill = palette_light()[1], alpha = 0.8) +
    theme_tq() +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
    labs(x = "language ISO 639-1 code",
         y = "number of friends")

Who are my most “influential” friends (i.e. friends with the biggest network)?

friends %>%
  ggplot(aes(x = log2(friends_count))) +
    geom_density(color = palette_light()[1], fill = palette_light()[1], alpha = 0.8) +
    theme_tq() +
    labs(x = "log2 of number of friends",
         y = "density")

How active are my friends (i.e. how often do they tweet)?

# Density of tweets per day of account age (log2 scale).
# Account age is measured against the current date rather than a hard-coded
# one, and is floored at one day so new accounts do not divide by zero.
friends %>%
  mutate(date = as.Date(account_created_at, format = "%Y-%m-%d"),
         today = Sys.Date(),
         days = pmax(as.numeric(today - date), 1),
         statusesCount_pDay = statuses_count / days) %>%
  ggplot(aes(x = log2(statusesCount_pDay))) +
    geom_density(color = palette_light()[1], fill = palette_light()[1], alpha = 0.8) +
    theme_tq()

Tidy Text Analysis

data(stop_words)

Unnest Words

friends_descr <- friends %>%
  unnest_tokens(word, description) %>%
  mutate(word_stem = wordStem(word)) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!grepl("\\.|http", word))

Most Commonly Used Words

friends_descr %>%
  count(word_stem, sort = TRUE) %>%
  filter(n > 20) %>%
  ggplot(aes(x = reorder(word_stem, n), y = n)) +
    geom_col(color = palette_light()[1], fill = palette_light()[1], alpha = 0.8) +
    coord_flip() +
    theme_tq() +
    labs(x = "",
         y = "count of word stem in all followers' descriptions")

Word Cloud

friends_descr %>%
  count(word_stem) %>%
  mutate(word_stem = removeNumbers(word_stem)) %>%
  with(wordcloud(word_stem, n, max.words = 100, colors = palette_light()))

Word Pairs

friends_descr_ngrams <- friends %>%
  unnest_tokens(bigram, description, token = "ngrams", n = 2, collapse = FALSE) %>%
  filter(!grepl("\\.|http", bigram)) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

Count of Words

bigram_counts <- friends_descr_ngrams %>%
  count(word1, word2, sort = TRUE)

Graph

bigram_counts %>%
  filter(n > 10) %>%
  ggplot(aes(x = reorder(word1, -n), y = reorder(word2, -n), fill = n)) +
    geom_tile(alpha = 0.8, color = "white") +
    scale_fill_gradientn(colours = c(palette_light()[[1]], palette_light()[[2]])) +
    coord_flip() +
    theme_tq() +
    theme(legend.position = "right") +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
    labs(x = "first word in pair",
         y = "second word in pair")

Graph Word Pairs

bigram_graph <- bigram_counts %>%
  filter(n > 5) %>%
  graph_from_data_frame()

set.seed(1)

a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
  geom_node_point(color =  palette_light()[1], size = 5, alpha = 0.8) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 0.5) +
  theme_void()

Negated Meanings

bigrams_separated <- friends %>%
  unnest_tokens(bigram, description, token = "ngrams", n = 2, collapse = FALSE) %>%
  filter(!grepl("\\.|http", bigram)) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(word1 == "not" | word1 == "no") %>%
  filter(!word2 %in% stop_words$word)

not_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
  count(word2, score, sort = TRUE) %>%
  ungroup()
not_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, n * score, fill = n * score > 0)) +
    geom_col(show.legend = FALSE) +
    scale_fill_manual(values = palette_light()) +
    labs(x = "",
         y = "Sentiment score * number of occurrences",
         title = "Words preceded by \"not\"") +
    coord_flip() +
    theme_tq()

Sentiment Analysis

What is the predominant sentiment?

friends_descr_sentiment <- friends_descr %>%
  left_join(select(bigrams_separated, word1, word2), by = c("word" = "word2")) %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  rename(nrc = sentiment.x, bing = sentiment.y) %>%
  mutate(nrc = ifelse(!is.na(word1), NA, nrc),
         bing = ifelse(!is.na(word1) & bing == "positive", "negative",
                       ifelse(!is.na(word1) & bing == "negative", "positive", bing)))
friends_descr_sentiment %>%
  filter(nrc != "positive") %>%
  filter(nrc != "negative") %>%
  gather(x, y, nrc, bing) %>%
  count(x, y, sort = TRUE) %>%
  filter(n > 10) %>%
  ggplot(aes(x = reorder(y, n), y = n)) +
    facet_wrap(~ x, scales = "free") +
    geom_col(color = palette_light()[1], fill = palette_light()[1], alpha = 0.8) +
    coord_flip() +
    theme_tq() +
    labs(x = "",
         y = "count of sentiment in followers' descriptions")

Are followers’ descriptions mostly positive or negative?

friends_descr_sentiment %>%
  count(screen_name, word, bing) %>%
  group_by(screen_name, bing) %>%
  summarise(sum = sum(n)) %>%
  spread(bing, sum, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  ggplot(aes(x = sentiment)) +
    geom_density(color = palette_light()[1], fill = palette_light()[1], alpha = 0.8) +
    theme_tq()

Word Cloud

friends_descr_sentiment %>%
  count(word, bing, sort = TRUE) %>%
  acast(word ~ bing, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = palette_light()[1:2],
                   max.words = 100)

Topic Modeling

dtm_words_count <- friends_descr %>%
  mutate(word_stem = removeNumbers(word_stem)) %>%
  count(screen_name, word_stem, sort = TRUE) %>%
  ungroup() %>%
  filter(word_stem != "") %>%
  cast_dtm(screen_name, word_stem, n)

# set a seed so that the output of the model is predictable
dtm_lda <- LDA(dtm_words_count, k = 5, control = list(seed = 1234))

topics_beta <- tidy(dtm_lda, matrix = "beta")
p1 <- topics_beta %>%
  filter(grepl("[a-z]+", term)) %>% # some words are Chinese, etc. I don't want these because ggplot doesn't plot them correctly
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta) %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, color = factor(topic), fill = factor(topic))) +
    geom_col(show.legend = FALSE, alpha = 0.8) +
    scale_color_manual(values = palette_light()) +
    scale_fill_manual(values = palette_light()) +
    facet_wrap(~ topic, ncol = 5) +
    coord_flip() +
    theme_tq() +
    labs(x = "",
         y = "beta (~ occurrence in topics 1-5)",
         title = "The top 10 most characteristic words describe topic categories.")
user_topic <- tidy(dtm_lda, matrix = "gamma") %>%
  arrange(desc(gamma)) %>%
  group_by(document) %>%
  top_n(1, gamma)
p2 <- user_topic %>%
  group_by(topic) %>%
  top_n(10, gamma) %>%
  ggplot(aes(x = reorder(document, -gamma), y = gamma, color = factor(topic))) +
    facet_wrap(~ topic, scales = "free", ncol = 5) +
    geom_point(show.legend = FALSE, size = 4, alpha = 0.8) +
    scale_color_manual(values = palette_light()) +
    scale_fill_manual(values = palette_light()) +
    theme_tq() +
    coord_flip() +
    labs(x = "",
         y = "gamma\n(~ affiliation with topics 1-5)")

Map Your Grids

grid.arrange(p1, p2, ncol = 1, heights = c(0.7, 0.3))
sessionInfo()
---
title: "Legal Analytics Module Twitter"
output:
  html_notebook
---
Install Packages
```
install.packages("bindrcpp")
install.packages("gridExtra")
install.packages("reshape2")
install.packages("topicmodels")
install.packages("maps")
install.packages("ggraph")
install.packages("igraph")
install.packages("tm")
install.packages("NLP")
install.packages("wordcloud")
install.packages("RColorBrewer")
install.packages("SnowballC")
install.packages("tidytext")
install.packages("tidyquant")
install.packages("quantmod")
install.packages("TTR")
install.packages("PerformanceAnalytics")
install.packages("xts")
install.packages("zoo")
install.packages("lubridate")
install.packages("forcats")
install.packages("stringr")
install.packages("dplyr")
install.packages("purrr")
install.packages("readr")
install.packages("tidyr")
install.packages("tibble")
install.packages("ggplot2")
install.packages("tidyverse")
install.packages("rtweet")
```
Load Packages
```{r}
library(bindrcpp)
library(gridExtra)
library(reshape2)
library(topicmodels)
library(maps)
library(ggraph)
library(igraph)
library(tm)
library(NLP)
library(wordcloud)
library(RColorBrewer)
library(SnowballC)
library(tidytext)
library(tidyquant)
library(quantmod)
library(TTR)
library(PerformanceAnalytics)
library(xts)
library(zoo)
library(lubridate)
library(forcats)
library(stringr)
library(dplyr)
library(purrr)
library(readr)
library(tidyr)
library(tibble)
library(ggplot2)
library(tidyverse)
library(rtweet)
```
## Introduction to rtweet

create a token with the web browser method: create token and save it as an environment variable

Create an Access Token Tutorial
[http://rtweet.info/articles/auth.html](http://rtweet.info/articles/auth.html)

```{r}
# Read the Twitter app credentials from environment variables (e.g. set in
# ~/.Renviron) so that secrets are never stored in the notebook itself.
twitter_consumer_key <- Sys.getenv("TWITTER_CONSUMER_KEY")
twitter_consumer_secret <- Sys.getenv("TWITTER_CONSUMER_SECRET")
if (!nzchar(twitter_consumer_key) || !nzchar(twitter_consumer_secret)) {
  stop(
    "Set TWITTER_CONSUMER_KEY and TWITTER_CONSUMER_SECRET before running.",
    call. = FALSE
  )
}
create_token(
  app = "sociolegaltech_R_analysis",
  consumer_key = twitter_consumer_key,
  consumer_secret = twitter_consumer_secret
)
```

 Search for tweets using a hashtag.
 ```
 name-of-dataframe <- search_tweets( "searchterm", n = numberoftweets, include_rts = falseortrue)
 ```
 
```{r}
# Request 18,000 statuses matching "#bitcoin", leaving retweets out.
searched_tweets <- search_tweets("#bitcoin", n = 18000, include_rts = FALSE)
```

You will now have a data frame with tweets that have been scraped from twitter. 

You can look at the dataframe by entering `view(searched_tweets)`

If you do not have any observations, something did not work correctly.

You can interact with the data frame, such as plotting a time series.

```
ts_plot(nameofdataframe, "time_interval") +
other options to make the plot
```

## plot time series of tweets
```{r}
# Plot tweet frequency in three-hour bins. The title names #bitcoin because
# searched_tweets was collected with the "#bitcoin" query.
ts_plot(searched_tweets, "3 hours") +
  ggplot2::theme_minimal() +
  ggplot2::theme(plot.title = ggplot2::element_text(face = "bold")) +
  ggplot2::labs(
    x = NULL, y = NULL,
    title = "Frequency of #bitcoin Twitter statuses from past 9 days",
    subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
    caption = "\nSource: Data collected from Twitter's REST API via rtweet"
  )
```

## Search for 10,000 English-language tweets sent from around Irvine, CA
You can use the search_tweets function to find tweets within a specified geographic area, using a Google API key to look up the coordinates.
```{r}
# Read the Google geocoding key from the environment instead of embedding it.
google_api_key <- Sys.getenv("GOOGLE_MAPS_API_KEY")
if (!nzchar(google_api_key)) {
  stop("Set GOOGLE_MAPS_API_KEY before running.", call. = FALSE)
}
# Look up the coordinates for Irvine, CA, then search English-language tweets
# geocoded to that area.
irvine_tweets <- search_tweets(
  "lang:en",
  geocode = lookup_coords("irvine, CA", "country:US", apikey = google_api_key),
  n = 10000
)
```

```{r}
# Install jsonlite only when it is missing, so knitting the notebook does not
# re-download the package (or fail because it is already loaded) on every run.
if (!requireNamespace("jsonlite", quietly = TRUE)) {
  install.packages("jsonlite")
}
library(jsonlite)
# Load a previously saved set of #bitcoin tweets from the course repository.
bitcoin_tweets <- fromJSON("https://github.com/sociolegaltech/sociolegaltech.github.io/raw/master/bitcoin_tweets.json")
```



## create lat/lng variables using all available tweet and profile geo-location data
```{r}
# Derive the lat and lng columns that the map chunk below plots.
irvine_tweets_latlon <- lat_lng(irvine_tweets)
```
## Plot the Orange County, California boundary and overlay the tweet locations
```{r}
# Drop all four plot margins so the map uses the whole device.
par(mar = rep(0, 4))
# Draw the outline of Orange County, California with a thin line.
maps::map("county", regions = "california,orange", lwd = 0.25)
# Overlay every tweet position as a small, semi-transparent blue dot.
points(
  irvine_tweets_latlon$lng,
  irvine_tweets_latlon$lat,
  pch = 20,
  cex = 0.75,
  col = rgb(0, 0.3, 0.7, 0.75)
)
```

## Get Tweets of A User
```{r}
# Fetch the account's timeline including retweets. The API parameter is spelled
# include_rts (as in search_tweets above); includeRts is the twitteR spelling.
timeline_tweets <- get_timeline("sociolegaltech", include_rts = TRUE)
```

## Get Mentions of A User
```{r}
# get_mentions() returns mentions of the authenticated account; its first
# argument is the number of tweets (n), so a screen name must not be passed.
mentions_tweets <- get_mentions()
```

## Get friends
```{r}
# Fetch the friends list for "sociolegaltech"; the next chunk reads its user_id column.
friends <- get_friends("sociolegaltech")
```

## Get Information on Friends
```{r}
# Overwrite friends with the user records looked up from each friend's user_id;
# the later chunks read lang, description, friends_count, etc. from this table.
friends <- lookup_users(friends$user_id)
```

## What languages do my friends speak?
```{r}
# Bar chart of the number of friends per profile language, tallest bar first.
friends %>%
  count(lang) %>%
  droplevels() %>%
  ggplot(aes(x = reorder(lang, desc(n)), y = n)) +
  geom_col(
    colour = palette_light()[1],
    fill = palette_light()[1],
    alpha = 0.8
  ) +
  labs(x = "language ISO 639-1 code", y = "number of friends") +
  theme_tq() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
```
## Who are my most “influential” friends (i.e. friends with the biggest network)?
```{r}
# Density of friends_count on a log2 scale across all friends.
ggplot(friends, aes(x = log2(friends_count))) +
  geom_density(
    colour = palette_light()[1],
    fill = palette_light()[1],
    alpha = 0.8
  ) +
  labs(x = "log2 of number of friends", y = "density") +
  theme_tq()
```

## How active are my friends (i.e. how often do they tweet)?
```{r}
# Density of tweets per day of account age (log2 scale).
# Account age is measured against the current date rather than a hard-coded
# one, and is floored at one day so new accounts do not divide by zero.
friends %>%
  mutate(date = as.Date(account_created_at, format = "%Y-%m-%d"),
         today = Sys.Date(),
         days = pmax(as.numeric(today - date), 1),
         statusesCount_pDay = statuses_count / days) %>%
  ggplot(aes(x = log2(statusesCount_pDay))) +
    geom_density(color = palette_light()[1], fill = palette_light()[1], alpha = 0.8) +
    theme_tq()
```

## Tidy Text Analysis

```{r}
# Load the stop_words data set; later chunks use it to drop common words.
data(stop_words)
```

## Unnest Words
```{r}
# One row per word of each friend's profile description. Stop words and tokens
# containing a period or "http" are removed; the stem is added as word_stem.
friends_descr <- friends %>%
  unnest_tokens(word, description) %>%
  anti_join(stop_words, by = "word") %>%
  filter(!grepl("\\.|http", word)) %>%
  mutate(word_stem = wordStem(word))
```

## Most Commonly Used Words
```{r}
# Horizontal bar chart of every word stem that occurs more than 20 times.
friends_descr %>%
  count(word_stem, sort = TRUE) %>%
  filter(n > 20) %>%
  ggplot(aes(x = reorder(word_stem, n), y = n)) +
  geom_col(
    colour = palette_light()[1],
    fill = palette_light()[1],
    alpha = 0.8
  ) +
  labs(
    x = "",
    y = "count of word stem in all followers' descriptions"
  ) +
  theme_tq() +
  coord_flip()
```

## Word Cloud
```{r}
# Count how often each word stem occurs, strip digits from the stems, and draw
# a word cloud of at most 100 stems sized by those counts.
# NOTE(review): digits are removed after counting, so stems that differ only
# by digits are presumably not merged — confirm this is intended.
friends_descr %>%
  count(word_stem) %>%
  mutate(word_stem = removeNumbers(word_stem)) %>%
  with(wordcloud(word_stem, n, max.words = 100, colors = palette_light()))
```

## Word Pairs
```{r}
# Split each description into two-word sequences, drop those containing a
# period or "http", and keep only pairs in which neither word is a stop word.
friends_descr_ngrams <- friends %>%
  unnest_tokens(bigram, description, token = "ngrams", n = 2, collapse = FALSE) %>%
  filter(!grepl("\\.|http", bigram)) %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(!(word1 %in% stop_words$word | word2 %in% stop_words$word))
```

## Count of Words
```{r}
# Tally each (word1, word2) pair, most frequent first; the tally is column n.
bigram_counts <- friends_descr_ngrams %>%
  count(word1, word2, sort = TRUE)
```

## Graph 
```{r}
# Heat map of word pairs seen more than 10 times, shaded by pair count.
bigram_counts %>%
  filter(n > 10) %>%
  ggplot(aes(x = reorder(word1, -n), y = reorder(word2, -n), fill = n)) +
  geom_tile(colour = "white", alpha = 0.8) +
  scale_fill_gradientn(
    colours = c(palette_light()[[1]], palette_light()[[2]])
  ) +
  coord_flip() +
  labs(x = "first word in pair", y = "second word in pair") +
  theme_tq() +
  theme(
    legend.position = "right",
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)
  )
```

# Graph Word Pairs
```{r}
# Build a graph from the word pairs that occur more than five times.
bigram_graph <- graph_from_data_frame(filter(bigram_counts, n > 5))

# Fix the random seed before drawing the "fr" layout.
set.seed(1)

# Closed arrow head used for the edges.
a <- grid::arrow(length = unit(0.15, "inches"), type = "closed")

# Edge transparency follows the pair count; each node is labelled with its name.
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = n),
    arrow = a,
    end_cap = circle(0.07, "inches"),
    show.legend = FALSE
  ) +
  geom_node_point(colour = palette_light()[1], size = 5, alpha = 0.8) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 0.5) +
  theme_void()
```

# Negated Meanings
```{r}
# Word pairs whose first word is "not" or "no" and whose second word is not a
# stop word.
bigrams_separated <- friends %>%
  unnest_tokens(bigram, description, token = "ngrams", n = 2, collapse = FALSE) %>%
  filter(!grepl("\\.|http", bigram)) %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(word1 %in% c("not", "no"), !word2 %in% stop_words$word)

# Words following "not" that carry an AFINN score, with how often each occurs.
# NOTE(review): some AFINN lexicon releases name the score column `value`
# rather than `score` — confirm against the installed version.
not_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(get_sentiments("afinn"), by = c(word2 = "word")) %>%
  count(word2, score, sort = TRUE) %>%
  ungroup()
```

```{r}
# The 20 words after "not" with the largest absolute contribution
# (count times sentiment score), coloured by the sign of that contribution.
not_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(x = word2, y = contribution, fill = contribution > 0)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = palette_light()) +
  coord_flip() +
  theme_tq() +
  labs(
    x = "",
    y = "Sentiment score * number of occurrences",
    title = "Words preceded by \"not\""
  )
```

## Sentiment Analysis
## What is the predominant sentiment?
```{r}
# Attach nrc and bing sentiment labels to every description word. A word that
# also appears after "not"/"no" (word1 is not NA) has its nrc label blanked and
# its bing polarity flipped.
# NOTE(review): the negation join matches on the word alone, so a word negated
# in one description is treated as negated wherever it occurs — confirm.
friends_descr_sentiment <- friends_descr %>%
  left_join(
    select(bigrams_separated, word1, word2),
    by = c("word" = "word2")
  ) %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  rename(nrc = sentiment.x, bing = sentiment.y) %>%
  mutate(
    nrc = ifelse(!is.na(word1), NA, nrc),
    bing = case_when(
      !is.na(word1) & bing == "positive" ~ "negative",
      !is.na(word1) & bing == "negative" ~ "positive",
      TRUE ~ bing
    )
  )
```

```{r}
# Counts of each nrc emotion (positive/negative excluded) and each bing
# polarity, one facet per lexicon, keeping labels seen more than 10 times.
friends_descr_sentiment %>%
  filter(nrc != "positive", nrc != "negative") %>%
  gather(x, y, nrc, bing) %>%
  count(x, y, sort = TRUE) %>%
  filter(n > 10) %>%
  ggplot(aes(x = reorder(y, n), y = n)) +
  geom_col(
    colour = palette_light()[1],
    fill = palette_light()[1],
    alpha = 0.8
  ) +
  facet_wrap(~ x, scales = "free") +
  coord_flip() +
  labs(x = "", y = "count of sentiment in followers' descriptions") +
  theme_tq()
```

## Are followers’ descriptions mostly positive or negative?
```{r}
# Net sentiment per account: total the bing-labelled words per screen_name and
# polarity, spread the polarities into columns (missing combinations become 0),
# and plot the density of positive minus negative.
friends_descr_sentiment %>%
  count(screen_name, word, bing) %>%
  group_by(screen_name, bing) %>%
  summarise(sum = sum(n)) %>%
  spread(bing, sum, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  ggplot(aes(x = sentiment)) +
    geom_density(color = palette_light()[1], fill = palette_light()[1], alpha = 0.8) +
    theme_tq()
```
## Word Cloud
```{r}
# Cast the word-by-polarity counts into a matrix (absent pairs filled with 0)
# and draw a comparison cloud of at most 100 words in the first two palette colours.
friends_descr_sentiment %>%
  count(word, bing, sort = TRUE) %>%
  acast(word ~ bing, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = palette_light()[1:2],
                   max.words = 100)
```

# Topic Modeling
```{r}
# Document-term matrix: one document per screen_name, terms are digit-free
# word stems, cell values are how often the stem occurs for that account.
dtm_words_count <- friends_descr %>%
  mutate(word_stem = removeNumbers(word_stem)) %>%
  filter(word_stem != "") %>%
  count(screen_name, word_stem, sort = TRUE) %>%
  ungroup() %>%
  cast_dtm(screen_name, word_stem, n)

# Fit a five-topic LDA model; the fixed seed keeps the result reproducible.
dtm_lda <- LDA(dtm_words_count, k = 5, control = list(seed = 1234))

# Tidy table of the model's "beta" matrix, used for the topic-term plot.
topics_beta <- tidy(dtm_lda, matrix = "beta")
```

```{r}
# Per topic, the ten terms with the highest beta, one facet per topic.
# Only terms containing a lower-case Latin letter are kept, because terms in
# other scripts (e.g. Chinese) are not rendered correctly by ggplot.
p1 <- topics_beta %>%
  filter(grepl("[a-z]+", term)) %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta)) %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(x = term, y = beta, colour = factor(topic), fill = factor(topic))) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  scale_color_manual(values = palette_light()) +
  scale_fill_manual(values = palette_light()) +
  facet_wrap(~ topic, ncol = 5) +
  coord_flip() +
  labs(
    x = "",
    y = "beta (~ occurrence in topics 1-5)",
    title = "The top 10 most characteristic words describe topic categories."
  ) +
  theme_tq()
```

```{r}
# For every document (account), keep the topic with the highest gamma.
# The result stays grouped by document.
user_topic <- dtm_lda %>%
  tidy(matrix = "gamma") %>%
  arrange(desc(gamma)) %>%
  group_by(document) %>%
  top_n(1, gamma)
```

```{r}
# Per topic, the ten accounts with the highest gamma, one facet per topic.
p2 <- user_topic %>%
  group_by(topic) %>%
  top_n(10, gamma) %>%
  ggplot(aes(x = reorder(document, -gamma), y = gamma, colour = factor(topic))) +
  geom_point(size = 4, alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free", ncol = 5) +
  scale_color_manual(values = palette_light()) +
  scale_fill_manual(values = palette_light()) +
  coord_flip() +
  labs(x = "", y = "gamma\n(~ affiliation with topics 1-5)") +
  theme_tq()
```

## Map Your Grids
```{r}
# Stack the two topic plots in one column, with relative heights 0.7 (p1) and 0.3 (p2).
grid.arrange(p1, p2, ncol = 1, heights = c(0.7, 0.3))
```

```{r}
# Print the R version and attached packages for reproducibility.
sessionInfo()
```