Each interview on the page sits in a table that includes a summary. Using rvest, I scrape the interview text from that table and work with just the summaries. With dplyr, tidytext, and stringr, I build a word count of the summary text. To control which words are included, I filter out stop words plus a custom exclusion list, keeping only the purposeful words. The first word cloud shows R's output before the stop word cleanup; the second excludes the words I selected. This exercise was intended as practice, and I also wanted a sense of the words describing these interviews. The natural next step was a sentiment analysis, which I decided to add at the last minute, so ggplot2 is loaded further down the page.
library(rvest)
## Warning: package 'rvest' was built under R version 4.3.3
library(tidytext)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(purrr)
## Warning: package 'purrr' was built under R version 4.3.3
# Define the URL of the page to scrape
url <- "http://studsterkel.matrix.msu.edu/htimes.php"
# Read the HTML content from the URL
page <- read_html(url)
# Extract the table from the page
# The CSS selector might need adjustment based on the actual structure of the HTML
table <- page %>%
  html_node("td.sterkel") %>%
  html_table(fill = TRUE)
# Inspect the structure of the parsed table
str(table)
## tibble [720 × 15] (S3: tbl_df/tbl/data.frame)
## $ X1 : chr [1:720] "Interview with Children of People Who Went Through the Depression" "File Name:\n \n\t \n terkel-a0a0k0-a.mp3\n Date:\n 1971\n Summary:\n "| __truncated__ "File Name:" "Date:" ...
## $ X2 : chr [1:720] "Interview with Children of People Who Went Through the Depression" "File Name:" "terkel-a0a0k0-a.mp3" "1971" ...
## $ X3 : chr [1:720] "Interview with Children of People Who Went Through the Depression" "terkel-a0a0k0-a.mp3" NA NA ...
## $ X4 : chr [1:720] "Interview with Children of People Who Went Through the Depression" "Date:" NA NA ...
## $ X5 : chr [1:720] "Interview with Children of People Who Went Through the Depression" "1971" NA NA ...
## $ X6 : chr [1:720] "Interview with Children of People Who Went Through the Depression" "Summary:" NA NA ...
## $ X7 : chr [1:720] NA "A montage of unidentified, young voices: Children of people who went through the Depression and their experienc"| __truncated__ NA NA ...
## $ X8 : chr [1:720] NA "Keywords:" NA NA ...
## $ X9 : chr [1:720] NA "Depressions -- 1929, Change in lifestyle, frugality, fear, economic striving" NA NA ...
## $ X10: chr [1:720] NA "Interviewer(s):" NA NA ...
## $ X11: chr [1:720] NA "Terkel, Studs" NA NA ...
## $ X12: chr [1:720] NA "Interviewee(s):" NA NA ...
## $ X13: chr [1:720] NA "" NA NA ...
## $ X14: logi [1:720] NA NA NA NA NA NA ...
## $ X15: logi [1:720] NA NA NA NA NA NA ...
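The parse is messy: the page nests layout tables, so html_table() returns fifteen partially duplicated columns. An alternative worth sketching is to skip html_table() and pull the cell text directly with html_elements() and html_text2(). This is a minimal sketch, assuming the summaries stay inside the td.sterkel cells selected above; cells and summary_cells are names I introduce here.
# Hedged alternative: work from raw cell text instead of the parsed table
# (assumes summaries live inside td.sterkel cells, as selected above)
cells <- page %>%
  html_elements("td.sterkel") %>%
  html_text2()
summary_cells <- cells[str_detect(cells, "Summary:")]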
# html_table() already returned a tibble, so display it directly
table
print(names(table))
## [1] "X1" "X2" "X3" "X4" "X5" "X6" "X7" "X8" "X9" "X10" "X11" "X12"
## [13] "X13" "X14" "X15"
print(head(table$X1))
## [1] "Interview with Children of People Who Went Through the Depression"
## [2] "File Name:\n \n\t \n terkel-a0a0k0-a.mp3\n Date:\n 1971\n Summary:\n A montage of unidentified, young voices: Children of people who went through the Depression and their experience with their parents? frugality, fear, and economic striving.\n Keywords:\n \n Depressions -- 1929, Change in lifestyle, frugality, fear, economic striving \n Interviewer(s):\n \n Terkel, Studs\n Interviewee(s):"
## [3] "File Name:"
## [4] "Date:"
## [5] "Summary:"
## [6] "Keywords:"
print(head(table$X2))
## [1] "Interview with Children of People Who Went Through the Depression"
## [2] "File Name:"
## [3] "terkel-a0a0k0-a.mp3"
## [4] "1971"
## [5] "A montage of unidentified, young voices: Children of people who went through the Depression and their experience with their parents? frugality, fear, and economic striving."
## [6] "Depressions -- 1929, Change in lifestyle, frugality, fear, economic striving"
print(head(table$X3))
## [1] "Interview with Children of People Who Went Through the Depression"
## [2] "terkel-a0a0k0-a.mp3"
## [3] NA
## [4] NA
## [5] NA
## [6] NA
print(head(table$X4))
## [1] "Interview with Children of People Who Went Through the Depression"
## [2] "Date:"
## [3] NA
## [4] NA
## [5] NA
## [6] NA
print(head(table$X5))
## [1] "Interview with Children of People Who Went Through the Depression"
## [2] "1971"
## [3] NA
## [4] NA
## [5] NA
## [6] NA
print(head(table$X6))
## [1] "Interview with Children of People Who Went Through the Depression"
## [2] "Summary:"
## [3] NA
## [4] NA
## [5] NA
## [6] NA
print(head(table$X7))
## [1] NA
## [2] "A montage of unidentified, young voices: Children of people who went through the Depression and their experience with their parents? frugality, fear, and economic striving."
## [3] NA
## [4] NA
## [5] NA
## [6] NA
print(head(table$X8))
## [1] NA "Keywords:" NA NA NA NA
print(head(table$X9))
## [1] NA
## [2] "Depressions -- 1929, Change in lifestyle, frugality, fear, economic striving"
## [3] NA
## [4] NA
## [5] NA
## [6] NA
print(head(table$X10))
## [1] NA "Interviewer(s):" NA NA
## [5] NA NA
print(head(table$X11))
## [1] NA "Terkel, Studs" NA NA
## [5] NA NA
print(head(table$X12))
## [1] NA "Interviewee(s):" NA NA
## [5] NA NA
print(head(table$X13))
## [1] NA "" NA NA NA NA
print(head(table$X14))
## [1] NA NA NA NA NA NA
print(head(table$X15))
## [1] NA NA NA NA NA NA
# Define a function to extract summaries from text
extract_summary <- function(text) {
  # Extract text after "Summary:" until the next newline or end of string
  summary_text <- str_extract(text, "(?<=Summary:\\s)(.*?)(?=\\n|$)")
  return(summary_text)
}
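A quick check of the helper on a made-up string (hypothetical input, not taken from the site) shows what the regex captures: everything after "Summary:" plus one whitespace character, up to the next newline.
# Hypothetical smoke test for extract_summary()
extract_summary("File Name: x.mp3\n Summary: A short test summary.\n Keywords: test")
## [1] "A short test summary."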
# Apply the function to the column containing the text
# The tibble is 'table' and the column is X1
summaries <- table$X1 %>%
  sapply(extract_summary) %>%
  na.omit() %>%
  unique() # Remove duplicates
# Create a tibble with the extracted summaries
summary_tibble <- tibble(Summary = summaries)
# Display the tibble
print(summary_tibble)
## # A tibble: 66 × 1
## Summary
## <chr>
## 1 " A montage of unidentified, young voices: Children of people who we…
## 2 " Interview with veteran Jimmy Sheridan on the veterans' plight and …
## 3 " Interview with Edgar Yipsel (Yip) Harburg, a songwriter who wrote …
## 4 " Interview with Ed Paulsen, a dayworker, on unemployment and the se…
## 5 " Interview with Ed Paulsen, a dayworker, on unemployment and the se…
## 6 " Interview with Pauline Kael, a film critic, on anger, violence, an…
## 7 " Studs Terkel's overview of the second chapter of \"Hard Times: Voi…
## 8 " Interview with Kitty McCullough, a retired seamstress who gave awa…
## 9 " Interview with Emma Tiller, a cook who discusses how African Ameri…
## 10 " Interview with Frank Czerwonka, a garbageman and hobo, on the code…
## # ℹ 56 more rows
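As an aside, purrr is attached at the top but never used; the sapply() step above reads a little cleaner with map_chr(), which errors early if any result is not a length-one character value. A minimal equivalent sketch (summaries_alt is a name I introduce here):
# Equivalent extraction with purrr (already attached above)
summaries_alt <- table$X1 %>%
  map_chr(extract_summary) %>%
  discard(is.na) %>%
  unique()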
# Combine all summaries into a single text
combined_text <- paste(summary_tibble$Summary, collapse = " ")
# Convert text to a tibble for processing
text_tibble <- tibble(text = combined_text)
# Tokenize the text into individual words and count their frequencies
word_counts <- text_tibble %>%
  unnest_tokens(word, text) %>% # Tokenize the text into words
  count(word, sort = TRUE) # Count the frequency of each word
# Alphabetize the word counts by the word column
word_counts_sorted <- word_counts %>%
  arrange(word)
print(word_counts)
## # A tibble: 447 × 2
## word n
## <chr> <int>
## 1 with 68
## 2 interview 64
## 3 a 52
## 4 and 50
## 5 the 49
## 6 on 43
## 7 in 18
## 8 who 18
## 9 of 14
## 10 depression 13
## # ℹ 437 more rows
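The counts above are dominated by function words like "with" and "the". tidytext ships its own stop_words data frame (columns word and lexicon), and the idiomatic removal is an anti_join on the word column rather than a %in% test; a minimal sketch:
# Idiomatic tidytext stopword removal with anti_join()
word_counts_clean <- text_tibble %>%
  unnest_tokens(word, text) %>%
  anti_join(tidytext::stop_words, by = "word") %>%
  count(word, sort = TRUE)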
# Load the necessary libraries
library(dplyr)
library(tidytext)
library(tibble)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
# Combine all summaries into a single text
combined_text <- paste(summary_tibble$Summary, collapse = " ")
# Convert text to a tibble for processing
text_tibble <- tibble(text = combined_text)
word_counts <- text_tibble %>%
  unnest_tokens(word, text) %>% # Tokenize the text into words
  # NOTE: at this point stop_words is tidytext's data frame, so this filter
  # matches nothing and the first cloud still shows stopwords; filtering
  # against stop_words$word would actually remove them
  filter(!word %in% stop_words) %>%
  filter(word != "interview") %>% # Remove the specific word "interview"
  count(word, sort = TRUE) # Count the frequency of each word
# Generate the word cloud
wordcloud(words = word_counts$word, # Words to display
          freq = word_counts$n, # Corresponding frequencies
          min.freq = 1, # Minimum frequency to display
          scale = c(3, 0.5), # Scale for word sizes
          colors = brewer.pal(8, "Dark2"), # Colors for words
          random.order = FALSE, # Words arranged by frequency
          rot.per = 0.35) # Proportion of words with rotation
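One practical note: wordcloud() places words with some randomness even when random.order = FALSE, so the layout shifts between knits. A set.seed() call immediately before a wordcloud() call pins the layout; placed here, it fixes the second cloud below (the seed value is arbitrary).
# Make the next word cloud layout reproducible
set.seed(1234)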
library(stopwords)
# Load stopwords as a plain character vector
stop_words <- stopwords::stopwords("en")
# List of specific words to exclude. unnest_tokens() lowercases the text and
# splits on punctuation, so every entry is a single lowercase token.
specific_words <- c("interview", "peggy", "terry", "mary", "owsley", "william",
                    "benton", "tom", "yoder", "jane", "robin", "langston",
                    "slim", "collier", "phyllis", "lorimer", "dorothy",
                    "bernstein", "larry", "van", "dusen", "paulsen",
                    "widman", "ruth", "sam", "frank", "jimmy", "sheridan",
                    "yip", "yipsel", "harburg", "edgar", "ed", "pauline",
                    "kael", "kittie", "mccullough", "tiller", "emma",
                    "louis", "czerwonka", "cesar", "chavez", "bob", "leary",
                    "jose", "yglesias", "joe", "morrison", "evelyn", "finn",
                    "lewis", "andreas", "mccarthy", "justin", "mike",
                    "stinson", "sally", "rand", "doc", "graham", "tony",
                    "soma", "jerome", "zerbe", "blankenship", "emil",
                    "loricks", "gardner", "means", "james", "farley",
                    "studs", "raymond", "moley", "alf", "landon", "hank",
                    "oettinger", "jeffries", "willie", "hartman", "harry",
                    "max", "naiman", "heller", "horace", "cayton",
                    "herman", "schulmin", "anna", "ramsey", "elsa",
                    "ponselle", "worthington", "shufro", "c", "st",
                    "isaacs", "durr", "eileen", "barthe", "nick",
                    "ben", "paulson", "beecher",
                    "hentges", "terkel's", "san", "john", "fred",
                    "terrell", "des", "howard", "kitty", "mis", "la", "mrs",
                    "oscar", "kelley", "ave", "went", "baldwin",
                    "overview", "80", "can", "orin", "year", "describes",
                    "discusses", "heleen", "two", "moines", "rome", "didn",
                    "ia", "t", "s")
# Combine all summaries into a single text
combined_text <- paste(summary_tibble$Summary, collapse = " ")
# Convert text to a tibble for processing
text_tibble <- tibble(text = combined_text)
# Tokenize the text, remove stopwords and specific words, and count word frequencies
word_counts <- text_tibble %>%
  unnest_tokens(word, text) %>% # Tokenize the text into words
  filter(!word %in% stop_words) %>% # Remove stopwords (now a character vector)
  filter(!word %in% specific_words) %>% # Remove the specific words above
  count(word, sort = TRUE) # Count the frequency of each word
# Keep the most frequent words; with this small vocabulary,
# top_n(1000, n) keeps everything
top_words <- word_counts %>%
  top_n(1000, n) # Select top 1000 words by frequency
# Generate the word cloud
wordcloud(words = top_words$word, # Words to display
          freq = top_words$n, # Corresponding frequencies
          min.freq = 1, # Minimum frequency to display
          scale = c(3, 0.5), # Adjust scale for word sizes
          colors = brewer.pal(8, "Dark2"), # Colors for words
          random.order = FALSE, # Words arranged by frequency
          rot.per = 0.35) # Proportion of words with rotation
# top_words is already a tibble; display it directly
top_words
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
# Load Bing sentiment lexicon
bing_sentiments <- get_sentiments("bing")
# Perform sentiment analysis
sentiment_analysis <- top_words %>%
  inner_join(bing_sentiments, by = "word")
print(sentiment_analysis)
## # A tibble: 29 × 3
## word n sentiment
## <chr> <int> <chr>
## 1 depression 13 negative
## 2 shame 8 negative
## 3 relief 5 positive
## 4 poverty 4 negative
## 5 strike 3 negative
## 6 bonus 2 positive
## 7 discrimination 2 negative
## 8 hard 2 negative
## 9 humiliation 2 negative
## 10 unemployed 2 negative
## # ℹ 19 more rows
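Before plotting, a quick aggregate shows the overall balance; count() with wt = n sums the word frequencies within each sentiment.
# Total word occurrences per sentiment (frequency-weighted count)
sentiment_analysis %>%
  count(sentiment, wt = n)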
# Visualize sentiment analysis
ggplot(sentiment_analysis, aes(x = sentiment, y = n)) +
  geom_col() +
  xlab("Sentiment") +
  ylab("Count") +
  ggtitle("Sentiment Analysis of Top Words") +
  theme_minimal()
ggplot(sentiment_analysis, aes(x = word, y = sentiment, fill = n)) +
  geom_tile() +
  scale_fill_gradient(low = "blue", high = "red") +
  labs(title = "Heatmap of Word Sentiment", x = "Word", y = "Sentiment") +
  theme_minimal() +
  coord_flip() # Flip coordinates for better readability
ggplot(sentiment_analysis, aes(x = sentiment, y = reorder(word, n), size = n, color = sentiment)) +
  geom_point() +
  scale_size(range = c(2, 10)) +
  labs(title = "Dot Plot of Word Sentiments", x = "Sentiment", y = "Word") +
  theme_minimal()
# Bar plot
ggplot(sentiment_analysis, aes(x = reorder(word, n), y = n, fill = sentiment)) +
  geom_col() +
  labs(title = "Bar Plot of Word Sentiments", x = "Word", y = "Count") +
  scale_fill_manual(values = c("positive" = "green", "negative" = "red")) +
  theme_minimal() +
  coord_flip() # Flip coordinates for better readability
# Facet grid
ggplot(sentiment_analysis, aes(x = reorder(word, n), y = n, fill = sentiment)) +
  geom_col() +
  facet_wrap(~sentiment) +
  labs(title = "Facet Grid of Word Sentiments", x = "Word", y = "Count") +
  scale_fill_manual(values = c("positive" = "coral", "negative" = "tan")) +
  theme_minimal() +
  coord_flip() # Flip coordinates for better readability
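One known quirk of the facet plot above: reorder(word, n) orders words globally, not within each facet. tidytext provides reorder_within() and scale_y_reordered() for per-facet ordering; a minimal sketch, keeping the same data and fill colors:
# Per-facet word ordering with tidytext's reorder_within()
ggplot(sentiment_analysis,
       aes(x = n, y = reorder_within(word, n, sentiment), fill = sentiment)) +
  geom_col() +
  facet_wrap(~sentiment, scales = "free_y") +
  scale_y_reordered() +
  scale_fill_manual(values = c("positive" = "coral", "negative" = "tan")) +
  labs(title = "Facet Grid of Word Sentiments", x = "Count", y = "Word") +
  theme_minimal()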