Each interview on the page sits in a table that includes a summary. Using rvest, I scrape the interview text from that table and work with just the summaries. With dplyr, tidytext, and stringr, I build a word count of the summary text. To control which words are included, I filter out stop words plus a custom exclusion list, keeping only the purposeful words. The first word cloud shows R's output before the stop word cleanup; the second excludes the words I selected. This exercise was intended as practice, and I also wanted a sense of the words describing these interviews. The natural next step was a sentiment analysis, which I decided to add at the last minute, so ggplot2 is loaded further down the page.
library(rvest)
## Warning: package 'rvest' was built under R version 4.3.3
library(tidytext)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(purrr)
## Warning: package 'purrr' was built under R version 4.3.3
# Define the URL of the page to scrape
url <- "http://studsterkel.matrix.msu.edu/htimes.php"
# Read the HTML content from the URL
page <- read_html(url)
# Extract the table from the page
# The CSS selector might need adjustment based on the actual structure of the HTML
table <- page %>%
  html_node("td.sterkel") %>%
  html_table(fill = TRUE)
# Inspect the structure of the parsed table
str(table)
## tibble [720 × 15] (S3: tbl_df/tbl/data.frame)
## $ X1 : chr [1:720] "Interview with Children of People Who Went Through the Depression" "File Name:\n \n\t \n terkel-a0a0k0-a.mp3\n Date:\n 1971\n Summary:\n "| __truncated__ "File Name:" "Date:" ...
## $ X2 : chr [1:720] "Interview with Children of People Who Went Through the Depression" "File Name:" "terkel-a0a0k0-a.mp3" "1971" ...
## $ X3 : chr [1:720] "Interview with Children of People Who Went Through the Depression" "terkel-a0a0k0-a.mp3" NA NA ...
## $ X4 : chr [1:720] "Interview with Children of People Who Went Through the Depression" "Date:" NA NA ...
## $ X5 : chr [1:720] "Interview with Children of People Who Went Through the Depression" "1971" NA NA ...
## $ X6 : chr [1:720] "Interview with Children of People Who Went Through the Depression" "Summary:" NA NA ...
## $ X7 : chr [1:720] NA "A montage of unidentified, young voices: Children of people who went through the Depression and their experienc"| __truncated__ NA NA ...
## $ X8 : chr [1:720] NA "Keywords:" NA NA ...
## $ X9 : chr [1:720] NA "Depressions -- 1929, Change in lifestyle, frugality, fear, economic striving" NA NA ...
## $ X10: chr [1:720] NA "Interviewer(s):" NA NA ...
## $ X11: chr [1:720] NA "Terkel, Studs" NA NA ...
## $ X12: chr [1:720] NA "Interviewee(s):" NA NA ...
## $ X13: chr [1:720] NA "" NA NA ...
## $ X14: logi [1:720] NA NA NA NA NA NA ...
## $ X15: logi [1:720] NA NA NA NA NA NA ...
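The parse is messy: the page nests layout tables, so html_table() returns fifteen partially duplicated columns. An alternative worth sketching is to skip html_table() and pull the cell text directly with html_elements() and html_text2(). This is a minimal sketch, assuming the summaries stay inside the td.sterkel cells selected above; cells and summary_cells are names I introduce here.
# Hedged alternative: work from raw cell text instead of the parsed table
# (assumes summaries live inside td.sterkel cells, as selected above)
cells <- page %>%
  html_elements("td.sterkel") %>%
  html_text2()
summary_cells <- cells[str_detect(cells, "Summary:")]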
# html_table() already returned a tibble, so display it directly
table
print(names(table))
## [1] "X1" "X2" "X3" "X4" "X5" "X6" "X7" "X8" "X9" "X10" "X11" "X12"
## [13] "X13" "X14" "X15"
print(head(table$X1))
## [1] "Interview with Children of People Who Went Through the Depression"
## [2] "File Name:\n \n\t \n terkel-a0a0k0-a.mp3\n Date:\n 1971\n Summary:\n A montage of unidentified, young voices: Children of people who went through the Depression and their experience with their parents? frugality, fear, and economic striving.\n Keywords:\n \n Depressions -- 1929, Change in lifestyle, frugality, fear, economic striving \n Interviewer(s):\n \n Terkel, Studs\n Interviewee(s):"
## [3] "File Name:"
## [4] "Date:"
## [5] "Summary:"
## [6] "Keywords:"
print(head(table$X2))
## [1] "Interview with Children of People Who Went Through the Depression"
## [2] "File Name:"
## [3] "terkel-a0a0k0-a.mp3"
## [4] "1971"
## [5] "A montage of unidentified, young voices: Children of people who went through the Depression and their experience with their parents? frugality, fear, and economic striving."
## [6] "Depressions -- 1929, Change in lifestyle, frugality, fear, economic striving"
print(head(table$X3))
## [1] "Interview with Children of People Who Went Through the Depression"
## [2] "terkel-a0a0k0-a.mp3"
## [3] NA
## [4] NA
## [5] NA
## [6] NA
print(head(table$X4))
## [1] "Interview with Children of People Who Went Through the Depression"
## [2] "Date:"
## [3] NA
## [4] NA
## [5] NA
## [6] NA
print(head(table$X5))
## [1] "Interview with Children of People Who Went Through the Depression"
## [2] "1971"
## [3] NA
## [4] NA
## [5] NA
## [6] NA
print(head(table$X6))
## [1] "Interview with Children of People Who Went Through the Depression"
## [2] "Summary:"
## [3] NA
## [4] NA
## [5] NA
## [6] NA
print(head(table$X7))
## [1] NA
## [2] "A montage of unidentified, young voices: Children of people who went through the Depression and their experience with their parents? frugality, fear, and economic striving."
## [3] NA
## [4] NA
## [5] NA
## [6] NA
print(head(table$X8))
## [1] NA "Keywords:" NA NA NA NA
print(head(table$X9))
## [1] NA
## [2] "Depressions -- 1929, Change in lifestyle, frugality, fear, economic striving"
## [3] NA
## [4] NA
## [5] NA
## [6] NA
print(head(table$X10))
## [1] NA "Interviewer(s):" NA NA
## [5] NA NA
print(head(table$X11))
## [1] NA "Terkel, Studs" NA NA
## [5] NA NA
print(head(table$X12))
## [1] NA "Interviewee(s):" NA NA
## [5] NA NA
print(head(table$X13))
## [1] NA "" NA NA NA NA
print(head(table$X14))
## [1] NA NA NA NA NA NA
print(head(table$X15))
## [1] NA NA NA NA NA NA
# Define a function to extract summaries from text
extract_summary <- function(text) {
  # Extract text after "Summary:" until the next newline or end of string
  summary_text <- str_extract(text, "(?<=Summary:\\s)(.*?)(?=\\n|$)")
  return(summary_text)
}
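A quick check of the helper on a made-up string (hypothetical input, not taken from the site) shows what the regex captures: everything after "Summary:" plus one whitespace character, up to the next newline.
# Hypothetical smoke test for extract_summary()
extract_summary("File Name: x.mp3\n Summary: A short test summary.\n Keywords: test")
## [1] "A short test summary."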
# Apply the function to the column containing the text
# The tibble is 'table' and the column is X1
summaries <- table$X1 %>%
  sapply(extract_summary) %>%
  na.omit() %>%
  unique() # Remove duplicates
# Create a tibble with the extracted summaries
summary_tibble <- tibble(Summary = summaries)
# Display the tibble
print(summary_tibble)
## # A tibble: 66 × 1
## Summary
## <chr>
## 1 " A montage of unidentified, young voices: Children of people who we…
## 2 " Interview with veteran Jimmy Sheridan on the veterans' plight and …
## 3 " Interview with Edgar Yipsel (Yip) Harburg, a songwriter who wrote …
## 4 " Interview with Ed Paulsen, a dayworker, on unemployment and the se…
## 5 " Interview with Ed Paulsen, a dayworker, on unemployment and the se…
## 6 " Interview with Pauline Kael, a film critic, on anger, violence, an…
## 7 " Studs Terkel's overview of the second chapter of \"Hard Times: Voi…
## 8 " Interview with Kitty McCullough, a retired seamstress who gave awa…
## 9 " Interview with Emma Tiller, a cook who discusses how African Ameri…
## 10 " Interview with Frank Czerwonka, a garbageman and hobo, on the code…
## # ℹ 56 more rows
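As an aside, purrr is attached at the top but never used; the sapply() step above reads a little cleaner with map_chr(), which errors early if any result is not a length-one character value. A minimal equivalent sketch (summaries_alt is a name I introduce here):
# Equivalent extraction with purrr (already attached above)
summaries_alt <- table$X1 %>%
  map_chr(extract_summary) %>%
  discard(is.na) %>%
  unique()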
# Combine all summaries into a single text
combined_text <- paste(summary_tibble$Summary, collapse = " ")
# Convert text to a tibble for processing
text_tibble <- tibble(text = combined_text)
# Tokenize the text into individual words and count their frequencies
word_counts <- text_tibble %>%
  unnest_tokens(word, text) %>% # Tokenize the text into words
  count(word, sort = TRUE) # Count the frequency of each word
# Alphabetize the word counts by the word column
word_counts_sorted <- word_counts %>%
  arrange(word)
print(word_counts)
## # A tibble: 447 × 2
## word n
## <chr> <int>
## 1 with 68
## 2 interview 64
## 3 a 52
## 4 and 50
## 5 the 49
## 6 on 43
## 7 in 18
## 8 who 18
## 9 of 14
## 10 depression 13
## # ℹ 437 more rows
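The counts above are dominated by function words like "with" and "the". tidytext ships its own stop_words data frame (columns word and lexicon), and the idiomatic removal is an anti_join on the word column rather than a %in% test; a minimal sketch:
# Idiomatic tidytext stopword removal with anti_join()
word_counts_clean <- text_tibble %>%
  unnest_tokens(word, text) %>%
  anti_join(tidytext::stop_words, by = "word") %>%
  count(word, sort = TRUE)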
# Load the necessary libraries
library(dplyr)
library(tidytext)
library(tibble)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
# Combine all summaries into a single text
combined_text <- paste(summary_tibble$Summary, collapse = " ")
# Convert text to a tibble for processing
text_tibble <- tibble(text = combined_text)
word_counts <- text_tibble %>%
  unnest_tokens(word, text) %>% # Tokenize the text into words
  # NOTE: at this point stop_words is tidytext's data frame, so this filter
  # matches nothing and the first cloud still shows stopwords; filtering
  # against stop_words$word would actually remove them
  filter(!word %in% stop_words) %>%
  filter(word != "interview") %>% # Remove the specific word "interview"
  count(word, sort = TRUE) # Count the frequency of each word
# Generate the word cloud
wordcloud(words = word_counts$word, # Words to display
          freq = word_counts$n, # Corresponding frequencies
          min.freq = 1, # Minimum frequency to display
          scale = c(3, 0.5), # Scale for word sizes
          colors = brewer.pal(8, "Dark2"), # Colors for words
          random.order = FALSE, # Words arranged by frequency
          rot.per = 0.35) # Proportion of words with rotation
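One practical note: wordcloud() places words with some randomness even when random.order = FALSE, so the layout shifts between knits. A set.seed() call immediately before a wordcloud() call pins the layout; placed here, it fixes the second cloud below (the seed value is arbitrary).
# Make the next word cloud layout reproducible
set.seed(1234)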
library(stopwords)
# Load stopwords as a plain character vector
stop_words <- stopwords::stopwords("en")
# List of specific words to exclude. unnest_tokens() lowercases the text and
# splits on punctuation, so every entry is a single lowercase token.
specific_words <- c("interview", "peggy", "terry", "mary", "owsley", "william",
                    "benton", "tom", "yoder", "jane", "robin", "langston",
                    "slim", "collier", "phyllis", "lorimer", "dorothy",
                    "bernstein", "larry", "van", "dusen", "paulsen",
                    "widman", "ruth", "sam", "frank", "jimmy", "sheridan",
                    "yip", "yipsel", "harburg", "edgar", "ed", "pauline",
                    "kael", "kittie", "mccullough", "tiller", "emma",
                    "louis", "czerwonka", "cesar", "chavez", "bob", "leary",
                    "jose", "yglesias", "joe", "morrison", "evelyn", "finn",
                    "lewis", "andreas", "mccarthy", "justin", "mike",
                    "stinson", "sally", "rand", "doc", "graham", "tony",
                    "soma", "jerome", "zerbe", "blankenship", "emil",
                    "loricks", "gardner", "means", "james", "farley",
                    "studs", "raymond", "moley", "alf", "landon", "hank",
                    "oettinger", "jeffries", "willie", "hartman", "harry",
                    "max", "naiman", "heller", "horace", "cayton",
                    "herman", "schulmin", "anna", "ramsey", "elsa",
                    "ponselle", "worthington", "shufro", "c", "st",
                    "isaacs", "durr", "eileen", "barthe", "nick",
                    "ben", "paulson", "beecher",
                    "hentges", "terkel's", "san", "john", "fred",
                    "terrell", "des", "howard", "kitty", "mis", "la", "mrs",
                    "oscar", "kelley", "ave", "went", "baldwin",
                    "overview", "80", "can", "orin", "year", "describes",
                    "discusses", "heleen", "two", "moines", "rome", "didn",
                    "ia", "t", "s")
# Combine all summaries into a single text
combined_text <- paste(summary_tibble$Summary, collapse = " ")
# Convert text to a tibble for processing
text_tibble <- tibble(text = combined_text)
# Tokenize the text, remove stopwords and specific words, and count word frequencies
word_counts <- text_tibble %>%
  unnest_tokens(word, text) %>% # Tokenize the text into words
  filter(!word %in% stop_words) %>% # Remove stopwords (now a character vector)
  filter(!word %in% specific_words) %>% # Remove the specific words above
  count(word, sort = TRUE) # Count the frequency of each word
# Keep the most frequent words; with this small vocabulary,
# top_n(1000, n) keeps everything
top_words <- word_counts %>%
  top_n(1000, n) # Select top 1000 words by frequency
# Generate the word cloud
wordcloud(words = top_words$word, # Words to display
          freq = top_words$n, # Corresponding frequencies
          min.freq = 1, # Minimum frequency to display
          scale = c(3, 0.5), # Adjust scale for word sizes
          colors = brewer.pal(8, "Dark2"), # Colors for words
          random.order = FALSE, # Words arranged by frequency
          rot.per = 0.35) # Proportion of words with rotation
# top_words is already a tibble; display it directly
top_words
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
# Load Bing sentiment lexicon
bing_sentiments <- get_sentiments("bing")
# Perform sentiment analysis
sentiment_analysis <- top_words %>%
  inner_join(bing_sentiments, by = "word")
print(sentiment_analysis)
## # A tibble: 29 × 3
## word n sentiment
## <chr> <int> <chr>
## 1 depression 13 negative
## 2 shame 8 negative
## 3 relief 5 positive
## 4 poverty 4 negative
## 5 strike 3 negative
## 6 bonus 2 positive
## 7 discrimination 2 negative
## 8 hard 2 negative
## 9 humiliation 2 negative
## 10 unemployed 2 negative
## # ℹ 19 more rows
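Before plotting, a quick aggregate shows the overall balance; count() with wt = n sums the word frequencies within each sentiment.
# Total word occurrences per sentiment (frequency-weighted count)
sentiment_analysis %>%
  count(sentiment, wt = n)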
# Visualize sentiment analysis
ggplot(sentiment_analysis, aes(x = sentiment, y = n)) +
  geom_col() +
  xlab("Sentiment") +
  ylab("Count") +
  ggtitle("Sentiment Analysis of Top Words") +
  theme_minimal()
ggplot(sentiment_analysis, aes(x = word, y = sentiment, fill = n)) +
  geom_tile() +
  scale_fill_gradient(low = "blue", high = "red") +
  labs(title = "Heatmap of Word Sentiment", x = "Word", y = "Sentiment") +
  theme_minimal() +
  coord_flip() # Flip coordinates for better readability
ggplot(sentiment_analysis, aes(x = sentiment, y = reorder(word, n), size = n, color = sentiment)) +
  geom_point() +
  scale_size(range = c(2, 10)) +
  labs(title = "Dot Plot of Word Sentiments", x = "Sentiment", y = "Word") +
  theme_minimal()
# Bar plot
ggplot(sentiment_analysis, aes(x = reorder(word, n), y = n, fill = sentiment)) +
  geom_col() +
  labs(title = "Bar Plot of Word Sentiments", x = "Word", y = "Count") +
  scale_fill_manual(values = c("positive" = "green", "negative" = "red")) +
  theme_minimal() +
  coord_flip() # Flip coordinates for better readability
# Facet grid
ggplot(sentiment_analysis, aes(x = reorder(word, n), y = n, fill = sentiment)) +
  geom_col() +
  facet_wrap(~sentiment) +
  labs(title = "Facet Grid of Word Sentiments", x = "Word", y = "Count") +
  scale_fill_manual(values = c("positive" = "coral", "negative" = "tan")) +
  theme_minimal() +
  coord_flip() # Flip coordinates for better readability
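One known quirk of the facet plot above: reorder(word, n) orders words globally, not within each facet. tidytext provides reorder_within() and scale_y_reordered() for per-facet ordering; a minimal sketch, keeping the same data and fill colors:
# Per-facet word ordering with tidytext's reorder_within()
ggplot(sentiment_analysis,
       aes(x = n, y = reorder_within(word, n, sentiment), fill = sentiment)) +
  geom_col() +
  facet_wrap(~sentiment, scales = "free_y") +
  scale_y_reordered() +
  scale_fill_manual(values = c("positive" = "coral", "negative" = "tan")) +
  labs(title = "Facet Grid of Word Sentiments", x = "Count", y = "Word") +
  theme_minimal()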