]{0,}[ ]{0,}((chapters|chapter) ([ivxl|0-9+]{1,}[,|-|–]){1,}[ivxl|0-9+]{1,})(.*$)'
if re.search(pat2, summary, re.IGNORECASE):
prefix = re.match(pat2, summary, re.IGNORECASE).group(1)
summary = summary.replace(prefix,"")
return summary.strip()
# Driver: walk every matched-book record, split aggregate summary files into
# one summary per section, save the pieces and delete the aggregate file.

# Totals accumulated over the whole run.
multiple_summaries = 0
total_new_summaries = 0
summary_path_missing = []
counter = 0

# Read all records up front so tqdm knows the total; the context manager
# closes the handle once the lines are in memory.
with open(matched_books, "r") as fp:
    fp_lines = fp.readlines()

for line in tqdm(fp_lines):
    line = line.strip()
    x = json.loads(line)
    counter += 1

    summaries_counted = 0
    # Only looked up so that a record without this key still fails loudly.
    book_unique_id = x['book_unique_id']
    source = basename(dirname(dirname(x['summary_path'])))
    book_name = basename(dirname(x['summary_path']))
    section_name = basename(x['summary_path'])

    # Capture the number of splits required from each summary file
    splits_reqd = [item['summary_path'] for item in x['splits']]
    num_splits_reqd = len(splits_reqd)

    summary_path = os.path.join("../", x['summary_path'])
    if not os.path.exists(summary_path):
        # Summary path missing
        summary_path_missing.append(summary_path)
        continue

    try:
        fx = open(summary_path, "r")
    except Exception as e:
        print (e)
        f_all_errors.write("Error loading summary path" + "\t" + summary_path)
        f_all_errors.write("\n")
        continue

    # Parsing stays outside the try so malformed JSON still propagates;
    # `with` guarantees the handle is released either way.
    with fx:
        summary_json = json.load(fx)

    summary_content = summary_json['summary']
    og_summary_content = summary_content

    # Handling typos in specific books that can make splitting summaries easier
    if source == 'cliffnotes' and book_name == "The Merry Wives of Windsor" and 'section_10_part_0.txt' in x['summary_path']:
        summary_content = "Scene 2 " + summary_content

    if "CHAPTER 1Summary" in summary_content:
        summary_content = summary_content.replace("CHAPTER SUMMARIES WITH NOTES PHASE THE FIRST - THE MAIDEN", "").replace("CHAPTER 1Summary", "CHAPTER 1 Summary")

    if source == 'sparknotes' and book_name == "Cyrano de Bergerac":
        summary_content = summary_content.replace("Act IV, scenes vi-x Summary -- Act IV", "Summary -- Act IV")

    if source == 'novelguide' and book_name == "Henry VI Part 1" and section_name == 'section_4_part_0.txt':
        summary_content = summary_content.replace('scence 6', 'scene 6')

    if source == 'novelguide' and book_name == "Cyrano de Bergerac" and section_name == 'section_8_part_0.txt':
        summary_content = summary_content.replace('Act, 5, scene 5', 'Act 5, scene 5')

    if source == 'pinkmonkey' and book_name == 'Emma' and section_name == 'section_10_part_0.txt':
        summary_content = summary_content.replace('CHAPTERS 11 & 12', 'CHAPTER 11-12')

    if source == 'pinkmonkey' and book_name == 'Jude the Obscure' and section_name == 'section_4_part_0.txt':
        summary_content = summary_content.replace('PART II CHAPTER 1 Summary', 'CHAPTER 1 Summary')

    summary_content = remove_prefixes_summary(summary_content)

    # A ranged prefix (contains '-') or the epilogue is not passed on as a
    # section name for the split.
    section_name_prefix = x['section_name_prefix']
    if '-' in section_name_prefix or section_name_prefix == 'epilogue':
        section_name_prefix = ""

    separated_summaries = separate_mulitple_summaries(summary_content, section_name_prefix, summary_path)
    total_new_summaries += len(separated_summaries.keys())

    print ("num_splits_reqd: ", splits_reqd, num_splits_reqd)
    print ("separated_summaries keys: ", separated_summaries.keys(), len(separated_summaries.keys()))

    # We can potentially split a summary text into more number of sections than we're able to match with the book chapters in gutenberg
    assert num_splits_reqd <= len(separated_summaries.keys())

    # No need to separate and use the new section name if we only have one single summary found after breaking
    # Also protects against some false positives wrt splitting the sections
    if len(separated_summaries.keys()) == 1:
        continue

    multiple_summaries += 1

    # If there is an empty section name, it means we couldn't find a section name to match that summary to a chapter
    separated_summaries.pop('', None)
    summaries_counted += save_separated_summaries(separated_summaries, summary_json, summary_path, book_name)

    # The aggregate file has been replaced by its per-section pieces.
    os.remove(summary_path)
================================================
FILE: scripts/data_collection/bookwolf/get_summaries.py
================================================
"""
/*
* Copyright (c) 2021, salesforce.com, inc.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
* For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
*/
"""
"""
Note: Summaries collected through bookwolf require significant manual cleanup owing to the way the HTML is written
"""
from builtins import zip, str, range
import pdb, os, csv, re, io, json
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup
from tqdm import tqdm
from shutil import rmtree
from nltk.tokenize import word_tokenize, sent_tokenize
from unidecode import unidecode
import time
import pdb
# PARAMS
# Directory under which one sub-directory per title is created.
SUMMARY_DIR = '../../raw_summaries/bookwolf/summaries'
# Base URL that scraped hrefs are joined against.
MAIN_SITE = 'https://web.archive.org/web/20210120012015/http://www.bookwolf.com/'

# Summary list info
summary_list_file = 'literature_links.tsv.pruned'

#File for capturing the HTTP Errors, for webpages that are not found
# Opened in write mode, so every run starts with an empty log.
f_errors = open("section_errors.txt","w")

# Get contents of the summary file
# newline='' hands line-ending handling to the csv module, as its docs require.
with open(summary_list_file, 'r', newline='') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    summary_infos = list(reader)
def get_overview_paragraphs(overview_links, specific_summary_dir):
    """Download each overview page and store its text as ``overview.json``.

    ``overview_links`` is an iterable of ``(url, link_text)`` pairs. Each page
    is tried twice, five seconds apart; a page that fails both times is
    recorded in ``f_errors`` and skipped. Every successful page is written to
    the same ``overview.json`` inside ``specific_summary_dir``, so a later
    link overwrites an earlier one.
    """

    def _fetch_paragraphs(url):
        # The first direct <p> child of the TextObject cell is skipped.
        page = BeautifulSoup(urllib.request.urlopen(url), "html.parser")
        cell = page.find("td", {"class": "TextObject"})
        return [unidecode(node.text.strip()) for node in cell.findAll("p", recursive=False)[1:]]

    for overview, name in overview_links:
        print (name, overview)

        try:
            overview_paragraphs = _fetch_paragraphs(overview)
        except Exception as first_error:
            print(first_error)
            time.sleep(5)
            try:
                overview_paragraphs = _fetch_paragraphs(overview)
            except Exception as second_error:
                print("No book summary for: ", overview, second_error)
                f_errors.write(overview + "\t" + name + "\t" + specific_summary_dir + "\n")
                continue

        overview_dict = {
            "name": "Overview",
            "summary": "\n".join(overview_paragraphs),
        }

        output_fname = os.path.join(specific_summary_dir, "overview.json")
        with open(output_fname, 'w', encoding="utf-8") as fp:
            json.dump(overview_dict, fp)
def get_section_paragraphs(section_links, specific_summary_dir):
#Fetch chapter level summary
for index, (section, name) in enumerate(section_links):
try:
print (name, section)
soup = BeautifulSoup(urllib.request.urlopen(section), "html.parser")
section_data = soup.find("td", {"class": "TextObject"})
except Exception as e:
print (e)
time.sleep(5)
try:
soup = BeautifulSoup(urllib.request.urlopen(section), "html.parser")
section_data = soup.find("td", {"class": "TextObject"})
except Exception as e:
print ("Chapter level summary not found for: ", section, e)
f_errors.write(section + "\t" + name + "\t" + specific_summary_dir + "\n")
continue
section_paragraphs = []
section_analysis = []
# Recursive function to parse HTML with upper case tags
def parse_upper_case_html_tags(paragraph):
paragraph_text = paragraph.text
if ">> {}. {} <<<'.format(k, title))
# Create a directory for the work if needed
specific_summary_dir = os.path.join(SUMMARY_DIR, title)
if not os.path.exists(specific_summary_dir):
os.makedirs(specific_summary_dir)
else:
print("Found existing directory.")
# continue
# Parse page
print ("page_url: ", page_url)
try:
soup = BeautifulSoup(urllib.request.urlopen(page_url), "html.parser")
except Exception as e:
print (page_url, e)
f_errors.write(str(k) + "\t" + title + "\t" + page_url + "\t" + specific_summary_dir + "\n")
continue
# Parse general summary
navigation_links = soup.find("table", {"id": "Table56"})
if navigation_links == None:
navigation_links = soup.find("td", {"class": "TextObject"})
overview_links = [(urllib.parse.urljoin(MAIN_SITE, link.get("href")), link.text) for link in navigation_links.findAll("a")\
if ("part" not in link.text.lower() and ("context" in link.get("href") or "summary" in link.get("href") or "synopsis" in link.get("href") ))]
# Filter out some of the links that are obviously not chapter summary links
# Since this source only has a handful of books, it was easy to hard code which links to fetch summaries from
section_links = [(urllib.parse.urljoin(MAIN_SITE, link.get("href")), link.text) for link in navigation_links.findAll("a") \
if ("interpretation" not in link.text.lower() and "comment" not in link.text.lower() and "author" not in link.text.lower()\
and "character" not in link.text.lower() and "questions" not in link.text.lower() and "life at the time" not in link.text.lower()\
and "theme" not in link.text.lower() and "foreword" not in link.text.lower() and "background" not in link.text.lower()\
and "symbolism" not in link.text.lower() and "introduction" not in link.text.lower() and "characterization" not in link.text.lower()\
and "setting" not in link.text.lower() and "family life" not in link.text.lower() and "comment" not in link.text.lower() \
and "context" not in link.text.lower() ) ]
if len(overview_links) != 0:
get_overview_paragraphs(overview_links, specific_summary_dir)
if len(section_links) != 0:
get_section_paragraphs(section_links, specific_summary_dir)
================================================
FILE: scripts/data_collection/bookwolf/get_works.py
================================================
"""
/*
* Copyright (c) 2021, salesforce.com, inc.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
* For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
*/
"""
from builtins import zip, str, range
import pdb, os, csv, re, io
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup
from tqdm import tqdm
from shutil import rmtree
from nltk.tokenize import word_tokenize, sent_tokenize
# PARAMS
# Root that the hrefs scraped from the index page are joined against.
MAIN_SITE = 'https://web.archive.org/'
# Archived snapshot of the bookwolf welcome page that is scraped for book links.
SEED_URL = 'https://web.archive.org/web/20210120012015/https://www.bookwolf.com/Welcome_to_Bookwolf1/welcome_to_bookwolf1.html'
def scrape_index_pages(seed_page):
    """Collect ``{"title", "url"}`` entries from the bookwolf index page.

    The book list is taken from the <p> elements of the eleventh <table>
    nested in the first top/left-aligned <tr> of ``seed_page``. Commas are
    removed from titles and hrefs are resolved against ``MAIN_SITE``.
    """
    page = BeautifulSoup(urllib.request.urlopen(seed_page), "html.parser")
    layout_rows = page.findAll("tr", {"align": "LEFT", "valign": "TOP"})
    nested_tables = layout_rows[0].findAll("table")
    books_table = nested_tables[10].findAll("p")

    print("Found %d items." % len(books_table))

    scraped_links = []
    for index, item in enumerate(books_table):
        # Each paragraph is expected to wrap a single anchor for one book.
        anchor = item.find("a")
        item_title = anchor.text
        item_url = anchor.get("href")

        print (index)
        print ("item_title: ", item_title)
        print ("item_url: ", item_url, "\n")

        entry = {
            "title": item_title.strip().replace(",",""),
            "url": urllib.parse.urljoin(MAIN_SITE, item_url.strip()),
        }
        scraped_links.append(entry)

    return scraped_links
# Scrape the index and dump one "title<TAB>url" row per book.
scraped_data = scrape_index_pages(SEED_URL)

with open("literature_links.tsv", "w") as fd:
    fd.writelines(
        "%s\t%s\n" % (entry["title"], entry["url"]) for entry in scraped_data
    )
================================================
FILE: scripts/data_collection/bookwolf/literature_links.tsv.pruned
================================================
King Lear https://web.archive.org/web/20210226213001/http://www.bookwolf.com/Free_Booknotes/King_Lear_free_booknotes/king_lear_free_booknotes.html
Jane Eyre https://web.archive.org/web/20210226213001/http://www.bookwolf.com/Free_Booknotes/Jane_Eyre_by_Charlotte_Bronte/jane_eyre.html
Macbeth https://web.archive.org/web/20210226213001/http://www.bookwolf.com/Free_Booknotes/Macbeth/macbeth.html
A Tale of Two Cities https://web.archive.org/web/20210226213001/http://www.bookwolf.com/Free_Booknotes/A_Tale_of_Two_Cities/a_tale_of_two_cities.html
Pride and Prejudice https://web.archive.org/web/20210226213001/http://www.bookwolf.com/Free_Booknotes/Pride___Prejudice/pride___prejudice.html
Wuthering Heights https://web.archive.org/web/20210226213001/http://www.bookwolf.com/Free_Booknotes/Wuthering_Heights/wuthering_heights.html
Frankenstein https://web.archive.org/web/20210226213001/http://www.bookwolf.com/Free_Booknotes/Frankenstein_by_Mary_Shelley/frankenstein_by_mary_shelley.html
Hamlet https://web.archive.org/web/20210226213001/http://www.bookwolf.com/Free_Booknotes/Hamlet/hamlet.html
Othello https://web.archive.org/web/20210226213001/http://www.bookwolf.com/Free_Booknotes/Othello/othello.html
Julius Caesar https://web.archive.org/web/20210226213001/http://www.bookwolf.com/Free_Booknotes/Julius_Caesar/julius_caesar.html
================================================
FILE: scripts/data_collection/cliffnotes/get_summaries.py
================================================
"""
/*
* Copyright (c) 2021, salesforce.com, inc.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
* For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
*/
"""
from builtins import zip, str, range
import pdb, os, csv, re, io, json
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup
from tqdm import tqdm
from shutil import rmtree
from nltk.tokenize import word_tokenize, sent_tokenize
from urllib.error import HTTPError, URLError
import time
# PARAMS
# Directory under which one sub-directory per title is created for the scraped output.
SUMMARY_DIR = '../../raw_summaries/cliffnotes/summaries'
# Base URL that scraped hrefs are joined against.
MAIN_SITE = 'https://web.archive.org/web/20210312193150/https://www.cliffsnotes.com/'
# Summary list info
# Tab-separated file that is read into (title, url) rows.
summary_list_file = 'literature_links.tsv.pruned'
# Log of pages that could not be fetched; write mode means each run starts it afresh.
errors_file = open("section_errors.txt","w")
def wrap_data(name, summary, analysis, url):
    """Bundle the fields scraped from one page into a plain dict record."""
    record = {}
    record["name"] = name
    record["summary"] = summary
    record["analysis"] = analysis
    record["url"] = url
    return record
def scrape_section_continuation(parent_soup, section_header):
    """Return the paragraphs of a section, following its continuation pages.

    Starting from ``parent_soup``, the stripped text of every direct <p>
    child of the content container is collected. While the title on the
    page's last "next" link equals ``section_header`` the section is treated
    as continuing: that link is fetched (resolved against ``MAIN_SITE``) and
    its paragraphs are appended.

    The pages are walked iteratively rather than recursively, and a URL that
    has already been followed ends the walk, so a chain that links back on
    itself cannot loop forever.
    """
    collected = []
    followed_urls = set()
    page = parent_soup

    while True:
        section_data = page.find("article", {"class": "copy"})
        if section_data is None:
            # For some links, the html structure is different
            section_data = page.find("div", {"class": "contentArea"})
            link = page.findAll("a", {"class": "cf-next icon-Next_Arrow"}, href=True)[-1]
            next_link_title = link.findAll("p")[-1].text.strip()
        else:
            link = page.findAll("a", {"class": "nav-bttn-filled"}, href=True)[-1]
            next_link_title = link.findAll("span")[-1].text.strip()

        collected.extend(
            paragraph.text.strip() for paragraph in section_data.findAll("p", recursive=False)
        )

        # A different title on the next link means the section ends here.
        if section_header != next_link_title:
            return collected

        next_url = urllib.parse.urljoin(MAIN_SITE, link.get("href"))
        if next_url in followed_urls:
            return collected
        followed_urls.add(next_url)

        page = BeautifulSoup(urllib.request.urlopen(next_url), "html.parser")
# Get contents of the summary file
# newline='' hands line-ending handling to the csv module, as its docs require.
with open(summary_list_file, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    summary_infos = list(reader)

# For each summary info
for k, (title, page_url) in enumerate(summary_infos):
    print('\n>>> {}. {} <<<'.format(k, title))

    # Create a directory for the work if needed
    specific_summary_dir = os.path.join(SUMMARY_DIR, title)
    if not os.path.exists(specific_summary_dir):
        os.makedirs(specific_summary_dir)
    else:
        # NOTE: the skip below is disabled, so an existing work is scraped
        # again and its files are overwritten despite the message.
        print("Found existing directory, skipping.")
        # continue

    # Parse page
    try:
        soup = BeautifulSoup(urllib.request.urlopen(page_url), "html.parser")
    except Exception as e:
        print (page_url, e)
        errors_file.write(page_url + "\t" + str(e) + "\n")
        continue

    # Parse general summary
    navigation_links = soup.find("section", {"class": "secondary-navigation"})
    if navigation_links is None:
        # Without the navigation block there is nothing to follow; log the
        # page and move on instead of aborting the whole run.
        print (page_url, "secondary navigation not found")
        errors_file.write(page_url + "\t" + "secondary navigation not found" + "\n")
        continue

    # href=True drops anchors that carry no href, which would otherwise
    # reach re.match / `in` as None.
    nav_hrefs = [link.get("href") for link in navigation_links.findAll("a", href=True)]
    overview_links = [urllib.parse.urljoin(MAIN_SITE, href) for href in nav_hrefs if re.match(".*book-summary$", href)]
    # The first "summary-and-analysis" link is dropped.
    section_links = [urllib.parse.urljoin(MAIN_SITE, href) for href in nav_hrefs if "summary-and-analysis" in href][1:]

    # Best effort: a failing overview page is reported but never stops the work.
    for overview in overview_links:
        try:
            soup = BeautifulSoup(urllib.request.urlopen(overview), "html.parser")
            overview_data = soup.find("article", {"class": "copy"})
            overview_paragraphs = filter(None, [paragraph.text.strip() for paragraph in overview_data.findAll("p", recursive=False)])
            overview_text = "".join(overview_paragraphs).replace("Continued on next page...", "")

            overview_data = wrap_data("Overview", overview_text, None, overview)
            output_fname = os.path.join(specific_summary_dir, 'overview.txt')
            with open(output_fname, 'w', encoding="utf-8") as f:
                f.write(json.dumps(overview_data))
        except Exception:
            print("No book summary")

    for index, section in enumerate(section_links):
        try:
            soup = BeautifulSoup(urllib.request.urlopen(section), "html.parser")
        except URLError as err:
            # URL errors get a single retry after a pause.
            print (err, "Retrying after sleep")
            time.sleep(10)
            try:
                soup = BeautifulSoup(urllib.request.urlopen(section), "html.parser")
            except Exception as e:
                print (section, e)
                errors_file.write(section + "\t" + str(e))
                errors_file.write("\n")
                continue
        except Exception as e:
            print (section, e)
            errors_file.write(section + "\t" + str(e) + "\n")
            continue

        section_header = soup.title.string.strip()
        print (section_header, section)

        section_paragraphs = list(filter(None, scrape_section_continuation(soup, section_header)))
        section_text = "".join(section_paragraphs).replace("Continued on next page...", "")

        # clean up and parse
        if "Summary\n" in section_text and "Analysis\n" in section_text:
            # Pieces that still contain the "Summary" heading form the
            # summary; the remaining pieces form the analysis.
            section_text_split = section_text.split("Analysis\n")
            summary_text = "".join([summary for summary in section_text_split if "Summary\n" in summary]).replace("Summary\n", "").strip()
            analysis_text = "".join([analysis for analysis in section_text_split if "Summary\n" not in analysis]).replace("Analysis\n", "").strip()
        else:
            summary_text = section_text.replace("Summary\n", "").strip()
            analysis_text = None

        section_data = wrap_data(section_header, summary_text, analysis_text, section)
        output_fname = os.path.join(specific_summary_dir, 'section_%d.txt' % index)
        with open(output_fname, 'w', encoding="utf-8") as f:
            f.write(json.dumps(section_data))
================================================
FILE: scripts/data_collection/cliffnotes/get_works.py
================================================
"""
/*
* Copyright (c) 2021, salesforce.com, inc.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
* For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
*/
"""
from builtins import zip, str, range
import pdb, os, csv, re, io
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup
from tqdm import tqdm
from shutil import rmtree
from nltk.tokenize import word_tokenize, sent_tokenize
# PARAMS
# Base URL that the hrefs scraped from the index page are joined against.
MAIN_SITE = 'https://web.archive.org/web/20210312193150/https://www.cliffsnotes.com/'
# Archived literature index page that is scraped for book links.
SEED_URL = 'https://web.archive.org/web/20210312193150/https://www.cliffsnotes.com/literature?filter=ShowAll&sort=TITLE'
def scrape_index_pages(seed_page):
    """Collect ``{"title", "url"}`` entries from the cliffsnotes index page.

    Every ``<li class="note">`` on ``seed_page`` yields one entry: the title
    comes from its ``note-name`` div (commas removed) and the url from its
    first anchor, resolved against ``MAIN_SITE``.
    """
    page = BeautifulSoup(urllib.request.urlopen(seed_page), "html.parser")
    items = page.findAll("li", {"class": "note"})
    print("Found %d items." % len(items))

    scraped_links = []
    for item in items:
        item_title = item.find("div", {"class": "note-name"}).text
        item_url = item.find("a").get("href")

        entry = {
            "title": item_title.strip().replace(",",""),
            "url": urllib.parse.urljoin(MAIN_SITE, item_url.strip()),
        }
        scraped_links.append(entry)

    return scraped_links
# Scrape the index and dump one "title<TAB>url" row per book.
scraped_data = scrape_index_pages(SEED_URL)

with open("literature_links.tsv", "w") as fd:
    fd.writelines(
        "%s\t%s\n" % (entry["title"], entry["url"]) for entry in scraped_data
    )
================================================
FILE: scripts/data_collection/cliffnotes/literature_links.tsv.pruned
================================================
The Taming of the Shrew https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/t/the-taming-of-the-shrew/play-summary
The Prince https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/p/the-prince/book-summary
Sense and Sensibility https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/s/sense-and-sensibility/book-summary
Far from the Madding Crowd https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/f/far-from-the-madding-crowd/book-summary
Tess of the d'Urbervilles https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/t/tess-of-the-durbervilles/book-summary
The Brothers Karamazov https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/b/the-brothers-karamazov/book-summary
The Picture of Dorian Gray https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/p/the-picture-of-dorian-gray/book-summary
Lord Jim https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/l/lord-jim/book-summary
The Red and the Black https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/r/the-red-and-the-black/book-summary
The Tempest https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/t/the-tempest/play-summary
The Portrait of a Lady https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/p/the-portrait-of-a-lady/book-summary
The Last of the Mohicans https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/l/the-last-of-the-mohicans/book-summary
Dracula https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/d/dracula/book-summary
Jane Eyre https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/j/jane-eyre/jane-eyre-at-a-glance
Adam Bede https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/a/adam-bede/book-summary
Madame Bovary https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/m/madame-bovary/book-summary
Main Street https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/m/main-street/book-summary
The Deerslayer https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/d/the-deerslayer/book-summary
Oliver Twist https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/o/oliver-twist/book-summary
The House of the Seven Gables https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/h/the-house-of-the-seven-gables/book-summary
Vanity Fair https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/v/vanity-fair/book-summary
Winesburg Ohio https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/w/winesburg-ohio/about-winesburg-ohio
Babbitt https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/b/babbitt/sinclair-lewis-biography
A Tale of Two Cities https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/t/a-tale-of-two-cities/a-tale-of-two-cities-at-a-glance
The House of Mirth https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/h/the-house-of-mirth/book-summary
The Scarlet Letter https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/s/the-scarlet-letter/the-scarlet-letter-at-a-glance
Alice's Adventures in Wonderland https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/a/alices-adventures-in-wonderland/book-summary
Pride and Prejudice https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/p/pride-and-prejudice/book-summary
Gulliver's Travels https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/g/gullivers-travels/book-summary
Wuthering Heights https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/w/wuthering-heights/wuthering-heights-at-a-glance
Frankenstein https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/f/frankenstein/frankenstein-at-a-glance
Jude the Obscure https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/j/jude-the-obscure/book-summary
Siddhartha https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/s/siddhartha/about-siddhartha
Ivanhoe https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/i/ivanhoe/book-summary
Treasure Island https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/t/treasure-island/book-summary
David Copperfield https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/d/david-copperfield/book-summary
Sister Carrie https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/s/sister-carrie/book-summary
Dr. Jekyll and Mr. Hyde https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/d/dr-jekyll-and-mr-hyde/book-summary
The Turn of the Screw https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/t/the-turn-of-the-screw/book-summary
Candide https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/c/candide/book-summary
Paradise Lost https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/p/paradise-lost/poem-summary
My Antonia https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/m/my-aacutentonia/book-summary
Hamlet https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/h/hamlet/hamlet-at-a-glance
A Midsummer Night's Dream https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/m/a-midsummer-nights-dream/play-summary
Othello https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/o/othello/play-summary
King Henry IV Part 1 https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/k/king-henry-iv-part-1/play-summary
Narrative of the Life of Frederick Douglass: An American Slave https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/n/narrative-of-the-life-of-frederick-douglass-an-american-slave/book-summary
Heart of Darkness https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/h/heart-of-darkness/heart-of-darkness-at-a-glance
Green Mansions https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/g/green-mansions/book-summary
The Power and the Glory https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/p/the-power-and-the-glory/about-the-power-and-the-glory
The Return of the Native https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/r/the-return-of-the-native/book-summary
The Pickwick Papers https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/p/the-pickwick-papers/book-summary
The Rise of Silas Lapham https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/r/the-rise-of-silas-lapham/about-the-rise-of-silas-lapham
The Red Badge of Courage https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/r/the-red-badge-of-courage/book-summary
The Mill on the Floss https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/m/the-mill-on-the-floss/book-summary
The Way of All Flesh https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/w/the-way-of-all-flesh/book-summary
Incidents in the Life of a Slave Girl https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/i/incidents-in-the-life-of-a-slave-girl/about-incidents-in-the-life-of-a-slave-girl
Macbeth https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/m/macbeth/macbeth-at-a-glance
Richard II https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/r/richard-ii/play-summary
Richard III https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/r/richard-iii/play-summary
Henry V https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/h/henry-v/play-summary
Julius Caesar https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/j/julius-caesar/play-summary
Love's Labour's Lost https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/l/loves-labours-lost/play-summary
Pygmalion https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/p/pygmalion/play-summary
Twelfth Night https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/t/twelfth-night/play-summary
Two Gentlemen of Verona https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/t/two-gentlemen-of-verona/play-summary
A Doll's House https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/d/a-dolls-house/play-summary
Arms and the Man https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/a/arms-and-the-man/play-summary
Ghosts https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/g/ghosts/play-summary
Hedda Gabler https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/h/hedda-gabler/play-summary
The Merry Wives of Windsor https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/m/the-merry-wives-of-windsor/play-summary
Don Juan https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/d/don-juan/poem-summary
Tartuffe https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/t/tartuffe/play-summary
An Enemy of the People https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/e/an-enemy-of-the-people/play-summary
The Secret Sharer https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/s/the-secret-sharer/story-summary
Man and Superman https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/m/man-and-superman/about-man-and-superman
Idylls of the King https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/i/idylls-of-the-king/about-idylls-of-the-king
Romeo and Juliet https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/r/romeo-and-juliet/romeo-and-juliet-at-a-glance
Phaedra https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/p/phaedra/play-summary
The Merchant of Venice https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/m/the-merchant-of-venice/play-summary
All's Well That Ends Well https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/a/alls-well-that-ends-well/play-summary
Major Barbara https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/m/major-barbara/play-summary
The Three Musketeers https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/t/the-three-musketeers/book-summary
Cyrano de Bergerac https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/c/cyrano-de-bergerac/play-summary
The Winter's Tale https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/w/the-winters-tale/play-summary
King Lear https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/k/king-lear/play-summary
Bleak House https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/b/bleak-house/book-summary
The Adventures of Huckleberry Finn https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/a/the-adventures-of-huckleberry-finn/the-adventures-of-huckleberry-finn-at-a-glance
Anthem https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/a/anthem/book-summary
Ethan Frome https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/e/ethan-frome/book-summary
The Jungle https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/j/the-jungle/about-the-jungle
The Ambassadors https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/a/the-ambassadors/book-summary
The Age of Innocence https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/a/the-age-of-innocence/book-summary
The Importance of Being Earnest https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/i/the-importance-of-being-earnest/play-summary
The Education of Henry Adams https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/e/the-education-of-henry-adams/book-summary
Much Ado About Nothing https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/m/much-ado-about-nothing/play-summary
Antony and Cleopatra https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/a/antony-and-cleopatra/play-summary
As You Like It https://web.archive.org/web/20201107055349/https://www.cliffsnotes.com/literature/a/as-you-like-it/play-summary
================================================
FILE: scripts/data_collection/gradesaver/get_summaries.py
================================================
"""
/*
* Copyright (c) 2021, salesforce.com, inc.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
* For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
*/
"""
from builtins import zip, str, range
import pdb, os, csv, re, io, json
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup
from tqdm import tqdm
from shutil import rmtree
from nltk.tokenize import word_tokenize, sent_tokenize
import time
from urllib.error import HTTPError, URLError
# PARAMS
SUMMARY_DIR = '../../raw_summaries/gradesaver/summaries'
MAIN_SITE = 'https://web.archive.org/web/20210630200124/https://www.gradesaver.com/'
# Seconds to pause before the single retry of a failed download
RETRY_DELAY = 10
# Summary list info
summary_list_file = 'literature_links.tsv.pruned'
# Heading that introduces a chapter inside a section page. The pattern text is
# unchanged; note that the "|" inside the brackets is a literal character.
CHAPTER_HEADING = re.compile('^(chapter [ivxl|0-9]+)', re.IGNORECASE)


def _fetch_soup(url):
    """Download `url` and return it parsed with the "html.parser" backend.

    The HTTP response is closed once the markup has been read. Exceptions
    raised by urlopen or by the parser propagate to the caller.
    """
    with urllib.request.urlopen(url) as response:
        return BeautifulSoup(response, "html.parser")


def _log_error(errors_file, url, err):
    """Print a failure and record it as one tab-separated line in the log."""
    print (url, err)
    # Always terminate the record so that entries never run together.
    errors_file.write(url + "\t" + str(err) + "\n")


def _write_json(path, payload):
    """Serialise `payload` as JSON into `path` (UTF-8)."""
    with open(path, 'w', encoding="utf-8") as fp:
        json.dump(payload, fp)


def _split_overview(texts):
    """Split stripped paragraph texts of an overview page.

    "summary"/"context" headers are dropped, an "analysis" header switches the
    destination to the analysis bucket for all following paragraphs.
    Returns (summary_text, analysis_text); paragraphs are joined without a
    separator.
    """
    summary, analysis = [], []
    in_summary = True
    for text in texts:
        lowered = text.lower()
        if lowered in ("summary", "context"):
            continue
        if lowered == "analysis":
            in_summary = False
            continue
        (summary if in_summary else analysis).append(text)
    return "".join(summary), "".join(analysis)


def _split_section(texts):
    """Split stripped paragraph/heading texts of a section page.

    Chapter headings are always kept in the summary, a "summary"/"context"
    header switches back to the summary bucket and an "analysis" header
    switches to the analysis bucket.
    Returns (summary_text, analysis_text); pieces are joined without a
    separator.
    """
    summary, analysis = [], []
    in_summary = True
    for text in texts:
        lowered = text.lower()
        # Handle chapter name occurring before the title 'Summary':
        # we want to capture it, not skip it
        if CHAPTER_HEADING.search(lowered):
            summary.append(text)
            continue
        if lowered in ("summary", "context"):
            in_summary = True
            continue
        if lowered == "analysis":
            in_summary = False
            continue
        (summary if in_summary else analysis).append(text)
    return "".join(summary), "".join(analysis)


# The error log stays open for the whole run and is closed (and flushed) even
# if the scrape is interrupted.
with open("section_errors.txt", "w") as errors_file:
    # Get contents of the summary file
    with open(summary_list_file, 'r') as tsvfile:
        summary_infos = list(csv.reader(tsvfile, delimiter='\t'))

    # For each summary info
    for k, (title, page_url) in enumerate(summary_infos):
        print('\n>>> {}. {} {} <<<'.format(k, title, page_url))

        # Create a directory for the work if needed
        specific_summary_dir = os.path.join(SUMMARY_DIR, title)
        if not os.path.exists(specific_summary_dir):
            os.makedirs(specific_summary_dir)
        else:
            # NOTE: despite the message the work is scraped again.
            print("Found existing directory, skipping.")

        # Parse page: every failure gets exactly one retry after a pause
        try:
            soup = _fetch_soup(page_url)
        except Exception as err:
            if isinstance(err, URLError):
                print (err, page_url, "Retrying after sleep")
            time.sleep(RETRY_DELAY)
            try:
                soup = _fetch_soup(page_url)
            except Exception as e:
                _log_error(errors_file, page_url, e)
                continue

        navigation_links = soup.find("ul", {"class": "navSection__list js--collapsible"})
        if navigation_links is None:
            # Page without the navigation list: record it instead of crashing
            _log_error(errors_file, page_url, "Navigation links not found")
            continue
        nav_items = navigation_links.findAll("li")

        # Parse general summary
        overview_links = [
            (urllib.parse.urljoin(MAIN_SITE, link.find("a").get("href")), link.text.strip())
            for link in nav_items
            if link.text.strip() == title + " Summary"
        ]
        print ("overview_links: ", overview_links)
        if len(overview_links) == 0:
            print ("No overview summaries found")
        else:
            for overview, name in overview_links:
                try:
                    overview_soup = _fetch_soup(overview)
                    overview_data = overview_soup.find("article", {"class": "section__article"})
                    overview_text, overview_analysis_text = _split_overview(
                        [p.text.strip() for p in overview_data.findAll("p", recursive=False)]
                    )
                    overview_dict = {
                        "name": "Overview",
                        "summary": overview_text,
                        "analysis": overview_analysis_text,
                        "url": overview,
                    }
                    _write_json(os.path.join(specific_summary_dir, "overview.json"), overview_dict)
                except Exception as e:
                    # Best effort: a missing overview must not stop the sections
                    print ("No overview summary for: ", overview)
                    print (e)

        # Parse section summaries
        section_groups = [
            link.find("ul").findAll("li")
            for link in nav_items
            if "Summary And Analysis" in link.text.strip()
        ]
        if len(section_groups) == 0:
            print ("No section summaries found")
            continue

        section_links = [
            (urllib.parse.urljoin(MAIN_SITE, link.find("a").get("href")), link.text.strip())
            for link in section_groups[0]
        ]
        for index, (section, name) in enumerate(section_links):
            print (name, section)
            try:
                section_soup = _fetch_soup(section)
            except URLError as err:
                print (err, section, "Retrying after sleep")
                time.sleep(RETRY_DELAY)
                try:
                    section_soup = _fetch_soup(section)
                except Exception as e:
                    _log_error(errors_file, section, e)
                    continue
            except Exception as e:
                print ("No section summary found for: ", section, e)
                errors_file.write(section + "\t" + str(e) + "\n")
                continue

            section_data = section_soup.find("article", {"class": "section__article"})
            if section_data is None:
                # Page without the expected article: record it and move on
                _log_error(errors_file, section, "No section article found")
                continue

            section_text, section_analysis_text = _split_section(
                [node.text.strip() for node in section_data.findAll(["p", "h2", "h3", "h4"])]
            )
            section_dict = {
                "name": name,
                "summary": section_text,
                "analysis": section_analysis_text,
                "url": section,
            }
            _write_json(os.path.join(specific_summary_dir, 'section_%d.txt' % index), section_dict)
================================================
FILE: scripts/data_collection/gradesaver/get_works.py
================================================
"""
/*
* Copyright (c) 2021, salesforce.com, inc.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
* For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
*/
"""
from builtins import zip, str, range
import pdb, os, csv, re, io, string
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup
from tqdm import tqdm
from shutil import rmtree
from nltk.tokenize import word_tokenize, sent_tokenize
import time
# PARAMS
# Archived snapshot of the site root; scraped hrefs are resolved against it
MAIN_SITE = 'https://web.archive.org/web/20210226083212/https://www.gradesaver.com/'
# Study-guide index of the same snapshot; a letter is appended per request
SEED_URL = MAIN_SITE + 'study-guides/'
# One index page is requested for every character of this string
alphabet_list = string.ascii_uppercase
# Log of index pages that could not be downloaded
errors_file = open("link_errors.txt", mode="w")
def scrape_index_pages(seed_page):
    """Collect title/URL pairs from the alphabetical index pages.

    For every character in `alphabet_list` the page `seed_page + char` is
    downloaded (with one retry after a 10 second pause) and every
    "columnList__item" entry of its first "columnList" list is recorded.

    Pages that cannot be downloaded, or that contain no book list, are
    written to `errors_file` and skipped.

    Returns a list of dicts with the keys "title" and "url"; URLs are
    resolved against MAIN_SITE.
    """
    scraped_links = []
    for char in alphabet_list:
        books_page = seed_page + char
        try:
            soup = BeautifulSoup(urllib.request.urlopen(books_page), "html.parser")
        except Exception:
            time.sleep(10)
            try:
                soup = BeautifulSoup(urllib.request.urlopen(books_page), "html.parser")
            except Exception as e:
                print ("Skipping book: ", books_page)
                errors_file.write(books_page + "\t" + str(e) + "\n")
                # Without this the code below would parse a stale (or
                # undefined) `soup` from a previous letter.
                continue
        items = soup.findAll("ul", {"class": "columnList"})
        if not items:
            print ("Skipping book: ", books_page)
            errors_file.write(books_page + "\t" + "No book list found on page" + "\n")
            continue
        # Go over each entry of the first list on the page
        for item in items[0].findAll("li", {"class":"columnList__item"}):
            anchor = item.find("a")
            item_title = anchor.text
            item_url = anchor.get("href")
            print ("item_title: ", item_title.strip())
            print ("item_url: ", item_url.strip(), "\n")
            scraped_links.append({
                "title": item_title.strip(),
                "url": urllib.parse.urljoin(MAIN_SITE, item_url.strip())
            })
    return scraped_links
# generate literature links
try:
    scraped_data = scrape_index_pages(SEED_URL)
finally:
    # The error log is opened at module level; close it explicitly so its
    # contents are flushed even when the scrape is interrupted.
    errors_file.close()
# One "title<TAB>url" row per scraped work
with open("literature_links.tsv", "w") as fd:
    for data in scraped_data:
        fd.write("%s\t%s\n" % (data["title"], data["url"]))
================================================
FILE: scripts/data_collection/gradesaver/literature_links.tsv.pruned
================================================
The Prince https://web.archive.org/web/20210417180108/https://www.gradesaver.com/the-prince
Sense and Sensibility https://web.archive.org/web/20210417183050/https://www.gradesaver.com/sense-and-sensibility
Far from the Madding Crowd https://web.archive.org/web/20210417171037/https://www.gradesaver.com/far-from-the-madding-crowd
Tess of the D'Urbervilles https://web.archive.org/web/20210212015315/https://www.gradesaver.com/tess-of-the-durbervilles
The Brothers Karamazov https://web.archive.org/web/20210417165227/https://www.gradesaver.com/the-brothers-karamazov
The Picture of Dorian Gray https://web.archive.org/web/20210417180108/https://www.gradesaver.com/the-picture-of-dorian-gray
Lord Jim https://web.archive.org/web/20210417183139/https://www.gradesaver.com/lord-jim
The Consolation of Philosophy https://web.archive.org/web/20210417175939/https://www.gradesaver.com/the-consolation-of-philosophy
The Rime of the Ancient Mariner https://web.archive.org/web/20210417180420/https://www.gradesaver.com/the-rime-of-the-ancient-mariner
The Autobiography of an Ex-Colored Man https://web.archive.org/web/20210417172435/https://www.gradesaver.com/the-autobiography-of-an-excolored-man
The White Devil https://web.archive.org/web/20210417183140/https://www.gradesaver.com/the-white-devil
The Tempest https://web.archive.org/web/20210212015315/https://www.gradesaver.com/the-tempest
King Solomon's Mines https://web.archive.org/web/20210417175853/https://www.gradesaver.com/king-solomons-mines
Gargantua and Pantagruel https://web.archive.org/web/20210417171712/https://www.gradesaver.com/gargantua-and-pantagruel
Uncle Vanya https://web.archive.org/web/20210417165119/https://www.gradesaver.com/uncle-vanya
The Portrait of a Lady https://web.archive.org/web/20210417180108/https://www.gradesaver.com/the-portrait-of-a-lady
The School for Scandal https://web.archive.org/web/20210417183050/https://www.gradesaver.com/the-school-for-scandal
Dracula https://web.archive.org/web/20210417171105/https://www.gradesaver.com/dracula
Jane Eyre https://web.archive.org/web/20210417165212/https://www.gradesaver.com/jane-eyre
Adam Bede https://web.archive.org/web/20210417172435/https://www.gradesaver.com/adam-bede
The Hound of the Baskervilles https://web.archive.org/web/20210417171128/https://www.gradesaver.com/the-hound-of-the-baskervilles
The Aeneid https://web.archive.org/web/20210417172435/https://www.gradesaver.com/the-aeneid
Oliver Twist https://web.archive.org/web/20210417180116/https://www.gradesaver.com/oliver-twist
The House of the Seven Gables https://web.archive.org/web/20210417171128/https://www.gradesaver.com/the-house-of-the-seven-gables
Vanity Fair https://web.archive.org/web/20210417172348/https://www.gradesaver.com/vanity-fair
Little Women https://web.archive.org/web/20210417183139/https://www.gradesaver.com/little-women
Babbitt https://web.archive.org/web/20210417165227/https://www.gradesaver.com/babbitt
A Tale of Two Cities https://web.archive.org/web/20210212015315/https://www.gradesaver.com/tale-of-two-cities
The Scarlet Letter https://web.archive.org/web/20210417183050/https://www.gradesaver.com/the-scarlet-letter
Emma https://web.archive.org/web/20210417182948/https://www.gradesaver.com/emma
Persuasion https://web.archive.org/web/20210417180108/https://www.gradesaver.com/persuasion
Pride and Prejudice https://web.archive.org/web/20210417180108/https://www.gradesaver.com/pride-and-prejudice
Gulliver's Travels https://web.archive.org/web/20210417171712/https://www.gradesaver.com/gullivers-travels
Wuthering Heights https://web.archive.org/web/20210417183140/https://www.gradesaver.com/wuthering-heights
Frankenstein https://web.archive.org/web/20210417171037/https://www.gradesaver.com/frankenstein
Mansfield Park https://web.archive.org/web/20210417171658/https://www.gradesaver.com/mansfield-park
Middlemarch https://web.archive.org/web/20210417171658/https://www.gradesaver.com/middlemarch
Siddhartha https://web.archive.org/web/20210417183050/https://www.gradesaver.com/siddhartha
Treasure Island https://web.archive.org/web/20210212015315/https://www.gradesaver.com/treasure-island
David Copperfield https://web.archive.org/web/20210417171105/https://www.gradesaver.com/david-copperfield
Sister Carrie https://web.archive.org/web/20210417183050/https://www.gradesaver.com/sister-carrie
Kidnapped https://web.archive.org/web/20210417175853/https://www.gradesaver.com/kidnapped
A Christmas Carol https://web.archive.org/web/20210417175939/https://www.gradesaver.com/a-christmas-carol
Dr. Jekyll and Mr. Hyde https://web.archive.org/web/20210417171105/https://www.gradesaver.com/dr-jekyll-and-mr-hyde
The Turn of the Screw https://web.archive.org/web/20210212015315/https://www.gradesaver.com/turn-of-the-screw
Candide https://web.archive.org/web/20210417175939/https://www.gradesaver.com/candide
Paradise Lost https://web.archive.org/web/20210417180108/https://www.gradesaver.com/paradise-lost
The Federalist Papers https://web.archive.org/web/20210417171037/https://www.gradesaver.com/the-federalist-papers
The Rivals https://web.archive.org/web/20210417180420/https://www.gradesaver.com/the-rivals
Northanger Abbey https://web.archive.org/web/20210417165319/https://www.gradesaver.com/northanger-abbey
My Antonia https://web.archive.org/web/20210417171658/https://www.gradesaver.com/my-antonia
A Vindication of the Rights of Woman https://web.archive.org/web/20210417172348/https://www.gradesaver.com/a-vindication-of-the-rights-of-woman
A Study in Scarlet https://web.archive.org/web/20210417183050/https://www.gradesaver.com/a-study-in-scarlet
Hamlet https://web.archive.org/web/20210417171128/https://www.gradesaver.com/hamlet
A Midsummer Night's Dream https://web.archive.org/web/20210417171658/https://www.gradesaver.com/midsummer-nights-dream
Othello https://web.archive.org/web/20210417180116/https://www.gradesaver.com/othello
A Room With a View https://web.archive.org/web/20210417180420/https://www.gradesaver.com/a-room-with-a-view
Mary Barton https://web.archive.org/web/20210417171658/https://www.gradesaver.com/mary-barton
Coriolanus https://web.archive.org/web/20210417175939/https://www.gradesaver.com/coriolanus
The Garden Party https://web.archive.org/web/20210417171712/https://www.gradesaver.com/the-garden-party
Around the World in 80 Days https://web.archive.org/web/20210417172435/https://www.gradesaver.com/around-the-world-in-80-days
What Maisie Knew https://web.archive.org/web/20210417183140/https://www.gradesaver.com/what-maisie-knew
The Vicar of Wakefield https://web.archive.org/web/20210417172348/https://www.gradesaver.com/the-vicar-of-wakefield
Utilitarianism https://web.archive.org/web/20210417165119/https://www.gradesaver.com/utilitarianism
Heart of Darkness https://web.archive.org/web/20210417171128/https://www.gradesaver.com/heart-of-darkness
A Hero of Our Time https://web.archive.org/web/20210417171128/https://www.gradesaver.com/a-hero-of-our-time
The Phantom of the Opera https://web.archive.org/web/20210417180108/https://www.gradesaver.com/the-phantom-of-the-opera
Leviathan https://web.archive.org/web/20210417183139/https://www.gradesaver.com/leviathan
The Canterville Ghost https://web.archive.org/web/20210417175939/https://www.gradesaver.com/the-canterville-ghost
Villette https://web.archive.org/web/20210417172348/https://www.gradesaver.com/villette
On Liberty https://web.archive.org/web/20210417180116/https://www.gradesaver.com/on-liberty
The Marrow of Tradition https://web.archive.org/web/20210417171658/https://www.gradesaver.com/the-marrow-of-tradition
The Wind in the Willows https://web.archive.org/web/20210417183140/https://www.gradesaver.com/the-wind-in-the-willows
Howards End https://web.archive.org/web/20210417171128/https://www.gradesaver.com/howards-end
The Secret Agent https://web.archive.org/web/20210417183050/https://www.gradesaver.com/the-secret-agent
Anne of Green Gables https://web.archive.org/web/20210417172435/https://www.gradesaver.com/anne-of-green-gables
A Little Princess https://web.archive.org/web/20210417183139/https://www.gradesaver.com/a-little-princess
Regeneration https://web.archive.org/web/20210417180420/https://www.gradesaver.com/regeneration
Therese Raquin https://web.archive.org/web/20210212015315/https://www.gradesaver.com/therese-raquin
The Monkey's Paw https://web.archive.org/web/20210417171658/https://www.gradesaver.com/the-monkeys-paw
Twelve Years a Slave https://web.archive.org/web/20210212015315/https://www.gradesaver.com/twelve-years-a-slave
House of Mirth https://web.archive.org/web/20210417171128/https://www.gradesaver.com/house-of-mirth
The Rise of Silas Lapham https://web.archive.org/web/20210417180420/https://www.gradesaver.com/the-rise-of-silas-lapham
The Red Badge of Courage https://web.archive.org/web/20210417180420/https://www.gradesaver.com/the-red-badge-of-courage
O Pioneers https://web.archive.org/web/20210417180116/https://www.gradesaver.com/o-pioneers
The Mill on the Floss https://web.archive.org/web/20210417171658/https://www.gradesaver.com/the-mill-on-the-floss
The Time Machine https://web.archive.org/web/20210212015315/https://www.gradesaver.com/the-time-machine
The Valley of Fear https://web.archive.org/web/20210417172348/https://www.gradesaver.com/the-valley-of-fear
News from Nowhere https://web.archive.org/web/20210417165319/https://www.gradesaver.com/news-from-nowhere
Little Dorrit https://web.archive.org/web/20210417183139/https://www.gradesaver.com/little-dorrit
Incidents in the Life of a Slave Girl https://web.archive.org/web/20210417175859/https://www.gradesaver.com/incidents-in-the-life-of-a-slave-girl
Henry IV Part 2 https://web.archive.org/web/20210417171128/https://www.gradesaver.com/henry-iv-part-2
Macbeth https://web.archive.org/web/20210417171658/https://www.gradesaver.com/macbeth
Troilus and Cressida https://web.archive.org/web/20210212015315/https://www.gradesaver.com/troilus-and-cressida
Richard II https://web.archive.org/web/20210417180420/https://www.gradesaver.com/richard-ii
Henry IV Part 1 https://web.archive.org/web/20210417171128/https://www.gradesaver.com/henry-iv-part-1
Richard III https://web.archive.org/web/20210417180420/https://www.gradesaver.com/richard-iii
Henry V https://web.archive.org/web/20210417171128/https://www.gradesaver.com/henry-v
Every Man in His Humour https://web.archive.org/web/20210417182948/https://www.gradesaver.com/every-man-in-his-humour
Titus Andronicus https://web.archive.org/web/20210212015315/https://www.gradesaver.com/titus-andronicus
Julius Caesar https://web.archive.org/web/20210417165212/https://www.gradesaver.com/julius-caesar
Second Treatise of Government https://web.archive.org/web/20210417183050/https://www.gradesaver.com/second-treatise-of-government
She Stoops to Conquer https://web.archive.org/web/20210417183050/https://www.gradesaver.com/she-stoops-to-conquer
The Wonderful Wizard of Oz https://web.archive.org/web/20210417183140/https://www.gradesaver.com/the-wonderful-wizard-of-oz
Pygmalion https://web.archive.org/web/20210417180108/https://www.gradesaver.com/pygmalion
Twelfth Night https://web.archive.org/web/20210212015315/https://www.gradesaver.com/twelfth-night
A Doll's House https://web.archive.org/web/20210417171105/https://www.gradesaver.com/a-dolls-house
Arms and the Man https://web.archive.org/web/20210417172435/https://www.gradesaver.com/arms-and-the-man
Ghosts https://web.archive.org/web/20210417171712/https://www.gradesaver.com/ghosts
The Spanish Tragedy https://web.archive.org/web/20210417183050/https://www.gradesaver.com/the-spanish-tragedy
Tartuffe https://web.archive.org/web/20210212015315/https://www.gradesaver.com/tartuffe
An Enemy of the People https://web.archive.org/web/20210417182948/https://www.gradesaver.com/an-enemy-of-the-people
Measure for Measure https://web.archive.org/web/20210417171658/https://www.gradesaver.com/measure-for-measure
The Seagull https://web.archive.org/web/20210417183050/https://www.gradesaver.com/the-seagull
Alice in Wonderland https://web.archive.org/web/20210417172435/https://www.gradesaver.com/alice-in-wonderland
Njal's Saga https://web.archive.org/web/20210417165319/https://www.gradesaver.com/njals-saga
White Fang https://web.archive.org/web/20210417183140/https://www.gradesaver.com/white-fang
Romeo and Juliet https://web.archive.org/web/20210417180420/https://www.gradesaver.com/romeo-and-juliet
The Ramayana https://web.archive.org/web/20210417180420/https://www.gradesaver.com/the-ramayana
Cymbeline https://web.archive.org/web/20210417175939/https://www.gradesaver.com/cymbeline
Troilus and Criseyde https://web.archive.org/web/20210212015315/https://www.gradesaver.com/troilus-and-criseyde
The Sorrows of Young Werther https://web.archive.org/web/20210417183050/https://www.gradesaver.com/the-sorrows-of-young-werther
The Hairy Ape https://web.archive.org/web/20210417171128/https://www.gradesaver.com/the-hairy-ape
All for Love https://web.archive.org/web/20210417172435/https://www.gradesaver.com/all-for-love
The Jew of Malta https://web.archive.org/web/20210417165212/https://www.gradesaver.com/the-jew-of-malta
The Coquette https://web.archive.org/web/20210417175939/https://www.gradesaver.com/the-coquette
An Ideal Husband https://web.archive.org/web/20210417175859/https://www.gradesaver.com/an-ideal-husband
The Three Musketeers https://web.archive.org/web/20210212015315/https://www.gradesaver.com/the-three-musketeers
Black Beauty https://web.archive.org/web/20210417165227/https://www.gradesaver.com/black-beauty
Evelina https://web.archive.org/web/20210417182948/https://www.gradesaver.com/evelina-or-the-history-of-a-young-ladys-entrance-into-the-world
The Winter's Tale https://web.archive.org/web/20210417183140/https://www.gradesaver.com/the-winters-tale
King Lear https://web.archive.org/web/20210417175853/https://www.gradesaver.com/king-lear
Bleak House https://web.archive.org/web/20210417165227/https://www.gradesaver.com/bleak-house
The Adventures of Huckleberry Finn https://web.archive.org/web/20210417172435/https://www.gradesaver.com/the-adventures-of-huckleberry-finn
Anthem https://web.archive.org/web/20210417172435/https://www.gradesaver.com/anthem
Ethan Frome https://web.archive.org/web/20210417182948/https://www.gradesaver.com/ethan-frome
The Jungle https://web.archive.org/web/20210417165212/https://www.gradesaver.com/the-jungle
The Age of Innocence https://web.archive.org/web/20210417172435/https://www.gradesaver.com/the-age-of-innocence
Mrs. Warren's Profession https://web.archive.org/web/20210417171658/https://www.gradesaver.com/mrs-warrens-profession
The Blithedale Romance https://web.archive.org/web/20210417165227/https://www.gradesaver.com/the-blithedale-romance
Dombey and Son https://web.archive.org/web/20210417171105/https://www.gradesaver.com/dombey-and-son
Much Ado About Nothing https://web.archive.org/web/20210417171658/https://www.gradesaver.com/much-ado-about-nothing
The Secret Garden https://web.archive.org/web/20210417183050/https://www.gradesaver.com/the-secret-garden
The Playboy of the Western World https://web.archive.org/web/20210417180108/https://www.gradesaver.com/the-playboy-of-the-western-world
The Duchess of Malfi https://web.archive.org/web/20210417171105/https://www.gradesaver.com/the-duchess-of-malfi
The Taming of the Shrew https://web.archive.org/web/20210212015315/https://www.gradesaver.com/the-taming-of-the-shrew
Antony and Cleopatra https://web.archive.org/web/20210417172435/https://www.gradesaver.com/antony-and-cleopatra
As You Like It https://web.archive.org/web/20210417172435/https://www.gradesaver.com/as-you-like-it
================================================
FILE: scripts/data_collection/novelguide/get_summaries.py
================================================
"""
/*
* Copyright (c) 2021, salesforce.com, inc.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
* For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
*/
"""
from builtins import zip, str, range
import pdb, os, csv, re, io, json, time
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup
from tqdm import tqdm
from shutil import rmtree
from nltk.tokenize import word_tokenize, sent_tokenize
from unidecode import unidecode
import argparse
# Command line interface: the only option is the boolean switch
# --fix_scraping_errors (False unless the flag is given).
PARSER = argparse.ArgumentParser(description='For processing HTTP errors separately')
PARSER.add_argument("--fix_scraping_errors", action="store_true", help="Flag indicating \
that script should recrape the links it missed")
# Arguments are parsed as soon as the module is executed.
ARGS = PARSER.parse_args()
# PARAMS
# Directory under which one sub-directory per title is created
SUMMARY_DIR = '../../raw_summaries/novelguide/summaries'
# Archived site root against which scraped hrefs are resolved
MAIN_SITE = 'https://web.archive.org/web/20210708184822/https://www.novelguide.com/'
def hasNumbers(inputString):
    """Tell whether `inputString` contains at least one digit character."""
    for character in inputString:
        if character.isdigit():
            return True
    return False
def get_section_level_data(section_links):
    """Download every section page and store it as a JSON file.

    `section_links` is an iterable of
    ``(index, (section_url, link_name), output_dir)`` tuples.

    For each entry the text of all non-empty <p> elements inside the
    "content clear-block" div is concatenated (falling back to the nested
    <div> elements when no paragraph has text) and written to `output_dir`
    as ``overview.txt`` when the name after the last colon is one of the
    overview titles, or as ``section_<index>.txt`` otherwise. The "analysis"
    field is always stored empty here.

    Entries that fail for any reason are appended to ``section_errors.txt``
    as ``index<TAB>url<TAB>name<TAB>output_dir`` lines. Returns None.
    """
    http_errors = []
    for index, (section, name), specific_summary_dir in section_links:
        print (name, section)
        try:
            # Close the HTTP response as soon as the markup has been read
            with urllib.request.urlopen(section) as response:
                soup = BeautifulSoup(response, "html.parser")
            section_data = soup.find("div", {"class": "content clear-block"})
            section_paragraphs = [
                unidecode(para.text.strip())
                for para in section_data.findAll("p")
                if para.text.strip()
            ]
            # Try alternate
            if not section_paragraphs:
                section_paragraphs = [
                    unidecode(para.text.strip())
                    for para in section_data.findAll("div")
                ]
            section_dict = {
                # All section names have colons, so we can get the section name
                # by splitting on colons and taking the last item
                "name": unidecode(name.split(':')[-1].strip()),
                "summary": "".join(section_paragraphs),
                # Actual analysis text to be extracted later
                "analysis": "",
                "url": unidecode(section),
            }
            # Check for the overview
            if section_dict["name"] in ["Overview", "Novel Summary", "NovelSummary", "Summary"]:
                output_fname = os.path.join(specific_summary_dir, 'overview.txt')
            else:  # Must be section file
                output_fname = os.path.join(specific_summary_dir, 'section_%d.txt' % int(index))
            with open(output_fname, 'w', encoding="utf-8") as fp:
                json.dump(section_dict, fp)
        except Exception as e:
            print ("No section summary for: ", section)
            print (e)
            time.sleep(5)
            http_errors.append((index, section, name, specific_summary_dir))
    # The errors file is created (truncated) by the caller before this
    # function runs; append to it and close the handle so repeated calls do
    # not accumulate open files.
    with open("section_errors.txt", "a") as f_errors:
        for (index, section, name, specific_summary_dir) in http_errors:
            f_errors.write(str(index) + "\t" + section + "\t" + name + "\t" + specific_summary_dir + "\n")
# Fragments of a navigation link text that mark it as a summary section
_SECTION_KEYWORDS = (
    'chapter', 'summary', 'section', 'stave', 'chp', 'scene', 'act ',
    'part', 'pages', 'lines', 'book', 'overview', 'prologue', 'epilogue',
)


def _is_section_link(link_text):
    """Return True when a navigation link text looks like a summary section.

    A link qualifies when its stripped, lower-cased text contains a digit or
    any of the fragments in _SECTION_KEYWORDS.
    """
    lowered = link_text.strip().lower()
    return hasNumbers(lowered) or any(keyword in lowered for keyword in _SECTION_KEYWORDS)


def _fetch_soup(url):
    """Download `url`, parse it with "html.parser" and close the response."""
    with urllib.request.urlopen(url) as response:
        return BeautifulSoup(response, "html.parser")


# fetch only the links that resulted in an http error
if ARGS.fix_scraping_errors:
    if not os.path.exists("section_errors.txt"):
        print ("No errors file found\nRun without scraping errors flag")
        raise SystemExit
    section_links = []
    with open("section_errors.txt","r") as f_errors:
        for line in f_errors:
            line_splits = line.rstrip().split("\t")
            if len(line_splits) < 4:
                # Not an "index, url, name, directory" record; ignore it
                continue
            section_links.append((line_splits[0],(line_splits[1], line_splits[2]), line_splits[3]))
    if len(section_links) == 0:
        print ("No errors found\nRun without scraping errors flag")
        raise SystemExit
    print ("Links with scraping errors scraped again: ", section_links)
    # Start from an empty errors file: links that fail again are appended to
    # it by get_section_level_data
    with open("section_errors.txt","w"):
        pass
    # fetch the summaries using the links that threw an error
    get_section_level_data(section_links)
    # Retry mode ends here. Continuing into the full scrape below would wipe
    # the errors recorded by the retry and download every work again.
    raise SystemExit

# Summary list info
summary_list_file = 'literature_links.tsv.pruned'
# Get contents of the summary file
with open(summary_list_file, 'r') as tsvfile:
    summary_infos = list(csv.reader(tsvfile, delimiter='\t'))

# Create the errors file every time when starting to scrape the summaries
# This file can be used to try and rescrape the links that resulted in an error
with open("section_errors.txt","w"):
    pass

with open("book_errors.txt","w") as f_book_errors:
    # For each summary info
    for k, (title, page_url) in enumerate(summary_infos):
        print('\n>>> {}. {} <<<'.format(k, title))
        # Create a directory for the work if needed
        specific_summary_dir = os.path.join(SUMMARY_DIR, title)
        if not os.path.exists(specific_summary_dir):
            os.makedirs(specific_summary_dir)
        else:
            print("Found existing directory.")

        # Parse page, retrying once. URLError also covers HTTPError, so a
        # dropped connection is handled like an HTTP error status.
        try:
            soup = _fetch_soup(page_url)
        except urllib.error.URLError:
            print ("HTTP error raised. Trying again")
            time.sleep(10)
            try:
                soup = _fetch_soup(page_url)
            except urllib.error.URLError:
                print ("Page not accessible: ", page_url)
                f_book_errors.write(str(k) + "\t" + title + "\t" + page_url + "\n")
                continue

        # Parse general summary
        navigation_links = soup.find("div", {"id": "block-booknavigation-3"})
        # Some links are just empty webpages
        if navigation_links is None:
            print ("Navigation links not found")
            f_book_errors.write(str(k) + "\t" + title + "\t" + page_url + "\n")
            continue

        section_links = [
            (urllib.parse.urljoin(MAIN_SITE, link.find("a").get("href")), link.text.strip())
            for link in navigation_links.findAll("li")
            if _is_section_link(link.text)
        ]
        # Append index and output directory to all the section links
        section_links_with_index = [
            (index, (section, name), specific_summary_dir)
            for index, (section, name) in enumerate(section_links)
        ]
        if not section_links_with_index:
            print ("No section summaries found")
        else:
            get_section_level_data(section_links_with_index)
================================================
FILE: scripts/data_collection/novelguide/get_works.py
================================================
"""
/*
* Copyright (c) 2021, salesforce.com, inc.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
* For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
*/
"""
from builtins import zip, str, range
import pdb, os, csv, re, io, string
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup
from tqdm import tqdm
from shutil import rmtree
from nltk.tokenize import word_tokenize, sent_tokenize
# PARAMS
# Archived snapshot of the site root; scraped hrefs are resolved against it
MAIN_SITE = 'https://web.archive.org/web/20210225014436/https://www.novelguide.com/'
# Title index of the same snapshot; an index character is appended per request
SEED_URL = MAIN_SITE + 'title/'
# Index characters: the lowercase letters followed by '1'
alphabet_list = string.ascii_lowercase + '1'
# Log of index pages that raised an error
errors_file = open("link_errors.txt", mode="w")
def scrape_index_pages(seed_page):
    """Collect title/URL pairs for every book on the paginated index pages.

    For each character in ``alphabet_list`` the crawl starts at
    ``seed_page + char`` and then requests ``?page=1``, ``?page=2``, ...
    Pagination for a character stops at the first page that raises while
    being fetched or parsed (for example when it has no ``ul.search-title``
    list); that exception is printed and logged to ``errors_file``.

    Args:
        seed_page: URL prefix to which each index character is appended.

    Returns:
        A list of ``{"title": ..., "url": ...}`` dicts; every URL is resolved
        against ``MAIN_SITE``.
    """
    scraped_links = []
    for char in alphabet_list:
        next_page_no = 1
        books_page = seed_page + char
        while True:
            try:
                response = urllib.request.urlopen(books_page)
                soup = BeautifulSoup(response, "html.parser")
                title_lists = soup.findAll("ul", {"class": "search-title"})
                # An index page without the list raises IndexError here, which
                # is what ends the pagination for this character.
                for item in title_lists[0].findAll("li"):
                    anchor = item.find("a")
                    item_title = anchor.text
                    item_url = anchor.get("href")
                    print ("item_title: ", item_title.strip())
                    print ("item_url: ", item_url.strip())
                    print ("\n")
                    entry = {
                        "title": item_title.strip(),
                        "url": urllib.parse.urljoin(MAIN_SITE, item_url.strip())
                    }
                    scraped_links.append(entry)
            except Exception as e:
                print (books_page, str(e))
                errors_file.write(books_page + "\t" + str(e) + "\n")
                break
            # The first follow-up page is "?page=1"; the counter is bumped afterwards.
            books_page = seed_page + char + "?page=" + str(next_page_no)
            next_page_no += 1
    return scraped_links
# generate literature links: crawl the index and dump one "title<TAB>url" row per book
scraped_data = scrape_index_pages(SEED_URL)
with open("literature_links.tsv", "w") as fd:
    for data in scraped_data:
        row = "{}\t{}\n".format(data["title"], data["url"])
        fd.write(row)
================================================
FILE: scripts/data_collection/novelguide/literature_links.tsv.pruned
================================================
Consolation of Philosophy https://web.archive.org/web/20210212220448/https://www.novelguide.com/consolation-of-philosophy
Mansfield Park https://web.archive.org/web/20210212220253/https://www.novelguide.com/mansfield-park/index.html
My Antonia https://web.archive.org/web/20210212220253/https://www.novelguide.com/my-antonia/index.html
Utopia https://web.archive.org/web/20210214151736/https://www.novelguide.com/utopia
Merry Wives of Windsor https://web.archive.org/web/20210212220253/https://www.novelguide.com/merry-wives-of-windsor
Lord Jim https://web.archive.org/web/20201204111055/https://www.novelguide.com/lord-jim/index.html
Common Sense https://web.archive.org/web/20210212220448/https://www.novelguide.com/common-sense/introduction
Richard II https://web.archive.org/web/20210708184822/https://www.novelguide.com/richard-ii/index.html
Love's Labours Lost https://web.archive.org/web/20201204111055/https://www.novelguide.com/loves-labours-lost/index.html
Bleak House https://web.archive.org/web/20201020010735/https://www.novelguide.com/bleak-house
Sense and Sensibility https://web.archive.org/web/20210212220225/https://www.novelguide.com/sense-and-sensibility/index.html
Tess of the d'Urbervilles https://web.archive.org/web/20210212221210/https://www.novelguide.com/tess-of-the-durbervilles
Dracula https://web.archive.org/web/20210212220747/https://www.novelguide.com/dracula/index.htm
Jane Eyre https://web.archive.org/web/20201022042445/https://www.novelguide.com/jane-eyre/index.html
Adam Bede https://web.archive.org/web/20210213005530/https://www.novelguide.com/adam-bede
Madame Bovary https://web.archive.org/web/20210212220253/https://www.novelguide.com/madame-bovary/index
Main Street https://web.archive.org/web/20210212220253/https://www.novelguide.com/main-street/index.html
Oliver Twist https://web.archive.org/web/20201204103600/https://www.novelguide.com/oliver-twist/index.html
Vanity Fair https://web.archive.org/web/20210212235930/https://www.novelguide.com/vanity-fair/index.html
Winesburg, Ohio https://web.archive.org/web/20210228134637/https://www.novelguide.com/winesburg-ohio/index.html
Little Women https://web.archive.org/web/20201204111055/https://www.novelguide.com/little-women/index.html
Babbitt https://web.archive.org/web/20201020010735/https://www.novelguide.com/babbitt
A Tale of Two Cities https://web.archive.org/web/20210213005530/https://www.novelguide.com/a-tale-of-two-cities
Emma https://web.archive.org/web/20210215030716/https://www.novelguide.com/emma
Persuasion https://web.archive.org/web/20210211232247/https://www.novelguide.com/persuasion/index.html
Pride and Prejudice https://web.archive.org/web/20210211232247/https://www.novelguide.com/pride-and-prejudice/index.html
Wuthering Heights https://web.archive.org/web/20210228134637/https://www.novelguide.com/wuthering-heights/index.html
Frankenstein https://web.archive.org/web/20201103045617/https://www.novelguide.com/frankenstein
Middlemarch https://web.archive.org/web/20210212220253/https://www.novelguide.com/middlemarch
Siddhartha https://web.archive.org/web/20210212220225/https://www.novelguide.com/siddhartha
Ivanhoe https://web.archive.org/web/20201205184924/https://www.novelguide.com/ivanhoe
David Copperfield https://web.archive.org/web/20210212220747/https://www.novelguide.com/david-copperfield/index
Sister Carrie https://web.archive.org/web/20210212220225/https://www.novelguide.com/sister-carrie/index.html
Kidnapped https://web.archive.org/web/20210228115510/https://www.novelguide.com/kidnapped
Dr. Jekyll and Mr. Hyde https://web.archive.org/web/20210212220747/https://www.novelguide.com/dr-jekyll-and-mr-hyde/index.html
Candide https://web.archive.org/web/20210212220448/https://www.novelguide.com/candide/index
Hamlet https://web.archive.org/web/20201204111334/https://www.novelguide.com/hamlet
A Midsummer Night's Dream https://web.archive.org/web/20210213005530/https://www.novelguide.com/a-midsummer-nights-dream
Othello https://web.archive.org/web/20201204103600/https://www.novelguide.com/othello
A Room With a View https://web.archive.org/web/20210213005530/https://www.novelguide.com/a-room-with-a-view/index
Coriolanus https://web.archive.org/web/20210212220448/https://www.novelguide.com/coriolanus/index
Around the World in Eighty Days https://web.archive.org/web/20200109120617/http://www.novelguide.com/around-the-world-in-eighty-days
Heart of Darkness https://web.archive.org/web/20201204111334/https://www.novelguide.com/heart-of-darkness
Notes from the Underground https://web.archive.org/web/20201022042315/https://www.novelguide.com/notes-from-the-underground
Maggie A Girl of the Streets https://web.archive.org/web/20210212220253/https://www.novelguide.com/maggie-a-girl-of-the-streets
Henry VIII https://web.archive.org/web/20201204111334/https://www.novelguide.com/henry-viii/index.html
Henry IV Part 2 https://web.archive.org/web/20201204111334/https://www.novelguide.com/henry-iv-part2
Macbeth https://web.archive.org/web/20210212220253/https://www.novelguide.com/macbeth
Henry IV Part 1 https://web.archive.org/web/20201204111334/https://www.novelguide.com/henry-iv-part-1/index.html
Henry VI Part 1 https://web.archive.org/web/20201204111334/https://www.novelguide.com/henry-vi-part-1/index.html
Richard III https://web.archive.org/web/20210213004944/https://www.novelguide.com/richard-iii/introduction
Henry V https://web.archive.org/web/20201204111334/https://www.novelguide.com/henry-v/index.html
Julius Caesar https://web.archive.org/web/20201022042445/https://www.novelguide.com/julius-caesar/index.html
Pygmalion https://web.archive.org/web/20210211232247/https://www.novelguide.com/pygmalion/introduction
A Doll's House https://web.archive.org/web/20210213005530/https://www.novelguide.com/a-dolls-house
Hedda Gabler https://web.archive.org/web/20201204111334/https://www.novelguide.com/hedda-gabler/index.html
Daisy Miller https://web.archive.org/web/20210212220747/https://www.novelguide.com/daisy-miller/index.html
An Enemy of the People https://web.archive.org/web/20200109120617/http://www.novelguide.com/an-enemy-of-the-people
Alice in Wonderland https://web.archive.org/web/20200109120617/http://www.novelguide.com/alice-in-wonderland/index
White Fang https://web.archive.org/web/20210228134637/https://www.novelguide.com/white-fang/index.html
Cymbeline https://web.archive.org/web/20210212220448/https://www.novelguide.com/cymbeline
All's Well That Ends Well https://web.archive.org/web/20200109120617/http://www.novelguide.com/alls-well-that-ends-well
Cyrano de Bergerac https://web.archive.org/web/20210212220448/https://www.novelguide.com/cyrano-de-bergerac/index
King Lear https://web.archive.org/web/20210228115510/https://www.novelguide.com/king-lear/index.html
Under the Greenwood Tree https://web.archive.org/web/20210214151736/https://www.novelguide.com/under-the-greenwood-tree
Of Human Bondage https://web.archive.org/web/20201204103600/https://www.novelguide.com/of-human-bondage
Beowulf https://web.archive.org/web/20201020010735/https://www.novelguide.com/beowulf
Anthem https://web.archive.org/web/20200109120617/http://www.novelguide.com/anthem
Ethan Frome https://web.archive.org/web/20210215030716/https://www.novelguide.com/ethan-frome/index.html
Invisible Man https://web.archive.org/web/20201205184924/https://www.novelguide.com/invisible-man/index.html
Much Ado About Nothing https://web.archive.org/web/20210212220253/https://www.novelguide.com/much-ado-about-nothing/index.html
Antony And Cleopatra https://web.archive.org/web/20200109120617/http://www.novelguide.com/antony-and-cleopatra/index
================================================
FILE: scripts/data_collection/pinkmonkey/get_summaries.py
================================================
"""
/*
* Copyright (c) 2021, salesforce.com, inc.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
* For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
*/
"""
from builtins import zip, str, range
import pdb, os, csv, re, io, json
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup
from tqdm import tqdm
from shutil import rmtree
from nltk.tokenize import word_tokenize, sent_tokenize
from unidecode import unidecode
import time
from urllib.error import HTTPError, URLError
# PARAMS
# Root output directory; the main loop creates one sub-directory per book title underneath it.
SUMMARY_DIR = '../../raw_summaries/pinkmonkey/summaries'
# Summary list info
# Tab-separated file read with csv.reader below; the main loop unpacks each row as (title, page_url).
summary_list_file = 'literature_links.tsv.pruned'
# Always create a new errors file when starting to run the script ("w" truncates any previous log).
f_errors = open("section_errors.txt","w")
# Load every row of the tab-separated summary list into memory
with open(summary_list_file, 'r') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    summary_infos = [row for row in reader]
def hasNumbers(inputString):
    """Return True when at least one character of ``inputString`` is a digit."""
    for char in inputString:
        if char.isdigit():
            return True
    return False
def chapter_section_check(link_text_lower, link_text_not_lower):
    """Tell whether a link's text looks like a chapter/section heading.

    Most keywords are searched in the lower-cased text.  ``Act`` and
    ``Section`` are matched case-sensitively against the original-case text.

    Args:
        link_text_lower: Lower-cased link text.
        link_text_not_lower: The same link text in its original case.

    Returns:
        True if any keyword is present, False otherwise.
    """
    lowercase_markers = ('chapter', 'scene', 'part', 'prologue', 'epilogue', 'story', 'preface')
    if any(marker in link_text_lower for marker in lowercase_markers):
        return True
    cased_markers = ('Act', 'Section')
    return any(marker in link_text_not_lower for marker in cased_markers)
def remove_toc(text):
    """Strip a trailing "table of contents ..." tail from ``text``.

    The pattern is anchored at the start of ``text``, is case-insensitive and
    does not cross newlines.  Because the leading ``.*`` is greedy, the tail
    that gets captured starts at the last "table of contents" on the first
    line; every occurrence of that captured tail is then removed from
    ``text``.  Text without a match is returned unchanged.
    """
    toc_match = re.match('((.*)(table[ ]{1,}of contents.*))', text, re.IGNORECASE)
    if toc_match is None:
        return text
    toc_tail = toc_match.group(3)
    return text.replace(toc_tail, "")
def get_overview_paragraphs(overview, specific_summary_dir):
    """Download a book's synopsis page and return its cleaned overview text.

    The page is fetched once and, on any error, a second time after a
    4 second pause.  If both attempts fail, a line is written to ``f_errors``
    and an empty list is returned (the failure value is kept as-is for
    backward compatibility, even though the success path returns a string).

    The ``<p>``/``<h3>`` elements are scanned in document order and the scan
    stops at the first one whose text contains "(synopsis)"
    (case-insensitive).  When none matches, the text of the last element
    scanned is used.  Everything up to and including the "(synopsis)" marker
    is removed, then the table-of-contents tail is stripped, the text is
    transliterated with ``unidecode`` and the spacing around ". " is
    normalised.

    Args:
        overview: URL of the synopsis/plot page.
        specific_summary_dir: Output directory of the book; only used in the
            error log line.

    Returns:
        The cleaned overview string ("" when the page has no ``<p>``/``<h3>``
        element), or ``[]`` when the page could not be downloaded.
    """
    overview_paragraphs = []
    try:
        soup = BeautifulSoup(urllib.request.urlopen(overview), "html.parser")
    except Exception as e:
        print (e)
        time.sleep(4)
        try:
            soup = BeautifulSoup(urllib.request.urlopen(overview), "html.parser")
        except Exception as e:
            print ("Overview not found: ", e, overview)
            f_errors.write(overview + "\t" + "Overview" + "\t" + specific_summary_dir + "\n")
            return overview_paragraphs
    # Raw string: "\(" in a normal string literal is an invalid escape sequence.
    synopsis_pat = re.compile(r"(.*\(synopsis\))", re.IGNORECASE)
    iframe_text = "Your browser does not support the IFRAME tag."
    # Start from an empty text so a page without any <p>/<h3> yields ""
    # instead of raising UnboundLocalError further down.
    overview_text = ""
    synopsis_match = None
    for paragraph in soup.findAll(["p", "h3"]):
        overview_text = paragraph.text.strip().replace(iframe_text, "").replace("\r\n", " ").replace("\n", " ")
        synopsis_match = synopsis_pat.match(overview_text)
        if synopsis_match:
            break
    if synopsis_match:
        # Drop the heading prefix that ends with the "(synopsis)" marker.
        overview_text = overview_text.replace(synopsis_match.group(1), "")
    overview_text = remove_toc(overview_text)
    overview_text = unidecode(overview_text)
    overview_text = ". ".join([line.strip() for line in overview_text.split('. ')])
    return overview_text
def save_section_para(section_text, section_title, section_link, specific_summary_dir, index):
    """Write one section summary as JSON to ``section_<index>.txt``.

    ``remove_toc`` is applied twice to ``section_text`` before saving.  The
    file is created inside ``specific_summary_dir`` and holds the keys
    ``name``, ``summary``, ``analysis`` (always an empty string) and ``url``.
    """
    # Two consecutive clean-up passes, as before.
    for _ in range(2):
        section_text = remove_toc(section_text)
    section_dict = {
        "name": section_title,
        "summary": section_text,
        "analysis": "",
        "url": section_link,
    }
    output_fname = os.path.join(specific_summary_dir, 'section_%d.txt' % index)
    with open(output_fname, 'w', encoding="utf-8") as fp:
        json.dump(section_dict, fp)
def get_section_paragraphs(page_url, specific_summary_dir):
    """Scrape the overview and chapter summaries linked from a book's index page.

    Every ``<a>`` on ``page_url`` whose text mentions "summaries", "synopsis",
    "plot" or a chapter keyword (see ``chapter_section_check``) is collected,
    with its ``href`` joined onto the directory part of ``page_url``.

    If any synopsis/plot link exists, the first one is saved as
    ``overview.json`` and chapter links that appear before it are ignored.
    Each remaining chapter link is downloaded (one retry after a pause),
    cleaned, and written through ``save_section_para``.  The section index is
    incremented before the download, so a chapter that fails to load leaves a
    gap in the ``section_<n>.txt`` numbering.  Failures are logged to
    ``f_errors``.

    NOTE: the initial fetch of ``page_url`` is not retried; an exception
    raised there propagates to the caller.

    Args:
        page_url: URL of the book's index page.
        specific_summary_dir: Directory that receives the output files.
    """
    # URLs always use "/" as separator, whatever the host OS; posixpath
    # behaves exactly like os.path on POSIX systems.
    import posixpath

    soup = BeautifulSoup(urllib.request.urlopen(page_url), "html.parser")
    one_level_up_url = posixpath.dirname(page_url)
    section_links = []
    overview_exists = 0
    for link in soup.findAll("a"):
        href = link.get("href")
        if href is None:
            # Anchors without a target (e.g. <a name=...>) cannot be followed;
            # joining None onto the base URL used to raise TypeError.
            continue
        link_text_not_lower = link.text.strip().replace("\r\n", " ").replace("\n", " ")
        link_text_lower = link_text_not_lower.lower()
        is_overview_link = 'synopsis' in link_text_lower or 'plot' in link_text_lower
        if "summaries" in link_text_lower or is_overview_link or chapter_section_check(link_text_lower, link_text_not_lower):
            section_path = posixpath.join(one_level_up_url, href)
            section_links.append((link.text.strip(), section_path))
        if is_overview_link:
            overview_exists = 1

    overview_found = 0
    index = -1
    iframe_text = "Your browser does not support the IFRAME tag."
    # Case-sensitive: everything from the first "Table of Contents" to the end.
    toc_pat = re.compile('.*?(Table of Contents(.*$))')
    for link_text, link in section_links:
        link_text = link_text.replace("\r\n", " ").replace("\n", " ")
        link_text_lower = link_text.lower()
        # Fetch overview first
        if overview_exists and ('synopsis' in link_text_lower or 'plot' in link_text_lower) and overview_found == 0:
            overview = link
            print (link_text, overview)
            overview_text = get_overview_paragraphs(overview, specific_summary_dir)
            overview_dict = {
                "name": "overview",
                "summary": overview_text,
                "analysis": "",
                "url": overview,
            }
            output_fname = os.path.join(specific_summary_dir, "overview.json")
            with open(output_fname, 'w', encoding="utf-8") as fp:
                json.dump(overview_dict, fp)
            overview_found = 1
            continue
        # Chapters are only processed once the overview (if there is one) has been handled.
        if not ((overview_found == 1 or not overview_exists) and chapter_section_check(link_text_lower, link_text)):
            continue
        chapter_url = link
        print(link_text, chapter_url)
        index += 1
        try:
            chapter_soup = BeautifulSoup(urllib.request.urlopen(chapter_url), "html.parser")
        except URLError as err:
            print (err, "Retrying after sleep")
            time.sleep(10)
            try:
                chapter_soup = BeautifulSoup(urllib.request.urlopen(chapter_url), "html.parser")
            except Exception as e:
                print (chapter_url, e)
                f_errors.write(chapter_url + "\t" + str(e))
                f_errors.write("\n")
                continue
        except Exception as e:
            print (e)
            time.sleep(4)
            try:
                chapter_soup = BeautifulSoup(urllib.request.urlopen(chapter_url), "html.parser")
            except Exception as e:
                print ("Chapter not found: ", e, chapter_url)
                f_errors.write(str(index) + "\t" + chapter_url + "\t" + link_text + "\t" + specific_summary_dir + "\n")
                continue
        section_text_paras = []
        for chapter_para in chapter_soup.findAll(["p", "h3"]):
            try:
                section_text = chapter_para.text.strip().replace(iframe_text, "").replace("\r\n", " ").replace("\n", " ")
                section_text_paras.append(unidecode(section_text))
            except Exception as e:  # No text inside the para HTML
                print ("Summary not found: ", e, chapter_url)
                f_errors.write(str(index) + "\t" + chapter_url + "\t" + link_text + "\t" + specific_summary_dir + "\n")
                continue
        section_text = ' '.join(section_text_paras)
        section_text = ". ".join([line.strip() for line in section_text.split('. ')])
        # Collapse every run of whitespace into a single space.
        section_text = " ".join(section_text.split())
        # Remove obvious noise from the summary text
        toc_match = toc_pat.match(section_text)
        if toc_match:
            section_text = section_text.replace(toc_match.group(1), "")
        # Site boilerplate that is left over in the scraped text.
        section_text = section_text.replace("Help / FAQ", "").strip()
        section_text = section_text.replace("Please Take our User Survey", "").strip()
        save_section_para(section_text, link_text, chapter_url, specific_summary_dir, index)
# Process every (title, page_url) row of the summary list
for k, (title, page_url) in enumerate(summary_infos):
    print('\n>>> {}. {} - {} <<<'.format(k, title, page_url))
    # Make sure the per-book output directory exists
    specific_summary_dir = os.path.join(SUMMARY_DIR, title)
    if os.path.exists(specific_summary_dir):
        # Informational only: the book is still processed (no "continue" here).
        print("Found existing directory, skipping.")
    else:
        os.makedirs(specific_summary_dir)
    # Check that the index page loads; a URLError gets one retry after 10 seconds
    try:
        soup = BeautifulSoup(urllib.request.urlopen(page_url), "html.parser")
    except URLError as err:
        print (err, "Retrying after sleep")
        time.sleep(10)
        try:
            soup = BeautifulSoup(urllib.request.urlopen(page_url), "html.parser")
        except Exception as e:
            print (page_url, e)
            f_errors.write(page_url + "\t" + str(e) + "\n")
            continue
    except Exception as e:
        # Any other failure skips the book without a retry or a log entry.
        print ("page not found: ", e)
        continue
    get_section_paragraphs(page_url, specific_summary_dir)
================================================
FILE: scripts/data_collection/pinkmonkey/get_works.py
================================================
"""
/*
* Copyright (c) 2021, salesforce.com, inc.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
* For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
*/
"""
from builtins import zip, str, range
import pdb, os, csv, re, io, string
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup
from tqdm import tqdm
from shutil import rmtree
from nltk.tokenize import word_tokenize, sent_tokenize
import time
# Book summaries from barronsbooknotes redirect to pinkmonkey
# Base URL against which the scraped book hrefs are resolved.
MAIN_SITE = 'https://web.archive.org/web/20180820042551/http://barronsbooknotes.com/'
# Prefix of the per-letter index pages; scrape_index_pages appends "<letter>.html" to it.
SEED_URL = 'https://web.archive.org/web/20180820042551/http://barronsbooknotes.com/'
# One index page is requested per lowercase ASCII letter.
alphabet_list = string.ascii_lowercase
# Opened in "w" mode, so the log is truncated on every run; receives index pages that could not be loaded.
errors_file = open("link_errors.txt","w")
def scrape_index_pages(seed_page):
    """Collect title/URL pairs for every book on the per-letter index pages.

    For each letter in ``alphabet_list`` the page ``seed_page + letter +
    ".html"`` is downloaded (with one retry after 10 seconds).  Pages that
    cannot be loaded, or that do not contain a ``<div align="left">`` block,
    are logged to ``errors_file`` and skipped.  Every ``<a>`` inside the
    first such block that has a non-empty title yields one entry.

    Args:
        seed_page: URL prefix of the index pages.

    Returns:
        A list of ``{"title": ..., "url": ...}`` dicts; titles have their
        whitespace collapsed and URLs are resolved against ``MAIN_SITE``.
    """
    scraped_links = []
    for char in alphabet_list:
        books_page = seed_page + char + ".html"
        try:
            soup = BeautifulSoup(urllib.request.urlopen(books_page), "html.parser")
        except Exception as e:
            time.sleep(10)
            try:
                soup = BeautifulSoup(urllib.request.urlopen(books_page), "html.parser")
            except Exception as e:
                print ("Skipping: ", books_page, e)
                errors_file.write(books_page + "\t" + str(e) + "\n")
                continue
        items = soup.findAll("div", {"align": "left"})
        if not items:
            # Without this guard items[0] raised an uncaught IndexError and
            # aborted the crawl for all remaining letters.
            reason = "no div with align=left found"
            print ("Skipping: ", books_page, reason)
            errors_file.write(books_page + "\t" + reason + "\n")
            continue
        # Go over each book link of the first block
        for item in items[0].findAll("a"):
            try:
                item_title = " ".join(item.text.strip().split())
                # Overwrite the character five from the end with '2'
                # (presumably the page digit right before ".asp" -- confirm).
                # A missing or too-short href raises and is reported below.
                url_chars = list(item.get("href"))
                url_chars[-5] = '2'
                item_url = "".join(url_chars)
                if item_title != "":
                    print ("item_title: ", item_title)
                    print ("item_url: ", item_url.strip(), "\n")
                    scraped_links.append({
                        "title": item_title,
                        "url": urllib.parse.urljoin(MAIN_SITE, item_url.strip())
                    })
            except Exception as e:
                print ("Link not found")
                print (e)
    return scraped_links
# generate literature links: crawl the index and dump one "title<TAB>url" row per book
scraped_data = scrape_index_pages(SEED_URL)
with open("literature_links.tsv", "w") as fd:
    for data in scraped_data:
        row = "{}\t{}\n".format(data["title"], data["url"])
        fd.write(row)
================================================
FILE: scripts/data_collection/pinkmonkey/literature_links.tsv.pruned
================================================
Portrait of a Lady https://web.archive.org/web/20180820033733/http://www.pinkmonkey.com/booknotes/monkeynotes/pmPortraitLady02.asp
Cymbeline https://web.archive.org/web/20180820053313/http://www.pinkmonkey.com/booknotes/monkeynotes/pmCymbeline02.asp
Man and Superman https://web.archive.org/web/20180820035130/http://www.pinkmonkey.com/booknotes/monkeynotes/pmManSuperman02.asp
Love's Labour's Lost https://web.archive.org/web/20180820051943/http://www.pinkmonkey.com/booknotes/monkeynotes/pmLovesLabours02.asp
Coriolanus https://web.archive.org/web/20180820053313/http://www.pinkmonkey.com/booknotes/monkeynotes/pmCoriolanus02.asp
Richard II https://web.archive.org/web/20180820044717/http://www.pinkmonkey.com/booknotes/monkeynotes/pmRichard02.asp
Wuthering Heights https://web.archive.org/web/20210416215230/http://www.pinkmonkey.com/booknotes/monkeynotes/pmWuthering02.asp
Frankenstein https://web.archive.org/web/20180820054109/http://www.pinkmonkey.com/booknotes/monkeynotes/pmFrankenstein02.asp
Winesburg, Ohio https://web.archive.org/web/20180820045946/http://www.pinkmonkey.com/booknotes/monkeynotes/pmWinesburg02.asp
Madame Bovary https://web.archive.org/web/20180820035130/http://www.pinkmonkey.com/booknotes/monkeynotes/pmMadameBovary02.asp
Othello https://web.archive.org/web/20210416215230/http://www.pinkmonkey.com/booknotes/monkeynotes/pmOthello02.asp
Sense and Sensibility https://web.archive.org/web/20180820034609/http://www.pinkmonkey.com/booknotes/monkeynotes/pmSenseSensibility02.asp
Tess of the D'Urbervilles https://web.archive.org/web/20180820050202/http://www.pinkmonkey.com/booknotes/monkeynotes/pmTessD02.asp
Lord Jim https://web.archive.org/web/20180820051943/http://www.pinkmonkey.com/booknotes/monkeynotes/pmLordJim02.asp
Dracula https://web.archive.org/web/20180820041120/http://www.pinkmonkey.com/booknotes/monkeynotes/pmDracula02.asp
Jane Eyre https://web.archive.org/web/20180820051427/http://www.pinkmonkey.com/booknotes/monkeynotes/pmJaneEyre02.asp
Main Street https://web.archive.org/web/20180820035130/http://www.pinkmonkey.com/booknotes/monkeynotes/pmMainStreet02.asp
Emma https://web.archive.org/web/20180820053844/http://www.pinkmonkey.com/booknotes/monkeynotes/pmEmma02.asp
Alice's Adventures In Wonderland https://web.archive.org/web/20180820042551/http://www.pinkmonkey.com/booknotes/monkeynotes/pmAlice02.asp
Jude the Obscure https://web.archive.org/web/20180820051427/http://www.pinkmonkey.com/booknotes/monkeynotes/pmJude02.asp
Middlemarch https://web.archive.org/web/20180820035130/http://www.pinkmonkey.com/booknotes/monkeynotes/pmMiddle02.asp
Siddhartha https://web.archive.org/web/20180820034609/http://www.pinkmonkey.com/booknotes/monkeynotes/pmSiddhartha02.asp
Kidnapped https://web.archive.org/web/20180820035817/http://www.pinkmonkey.com/booknotes/monkeynotes/pmKidnapped02.asp
Dr. Jekyll and Mr. Hyde https://web.archive.org/web/20180820041120/http://www.pinkmonkey.com/booknotes/monkeynotes/pmJekyll02.asp
My Antonia https://web.archive.org/web/20180820035130/http://www.pinkmonkey.com/booknotes/monkeynotes/pmMyAntonia02.asp
Hamlet https://web.archive.org/web/20180820040441/http://www.pinkmonkey.com/booknotes/monkeynotes/pmHamlet02.asp
A Midsummer Night's Dream https://web.archive.org/web/20180820035130/http://www.pinkmonkey.com/booknotes/monkeynotes/pmMidsummer02.asp
Around the World in Eighty Days https://web.archive.org/web/20180820042551/http://www.pinkmonkey.com/booknotes/monkeynotes/pmAroundWorld02.asp
Heart of Darkness https://web.archive.org/web/20180820040441/http://www.pinkmonkey.com/booknotes/monkeynotes/pmHeartDarkness02.asp
Typee https://web.archive.org/web/20180820050202/http://www.pinkmonkey.com/booknotes/monkeynotes/pmTypee02.asp
Maggie: A Girl of the Streets https://web.archive.org/web/20180820035130/http://www.pinkmonkey.com/booknotes/monkeynotes/pmMaggie02.asp
Henry VIII https://web.archive.org/web/20180820040441/http://www.pinkmonkey.com/booknotes/monkeynotes/pmHenry802.asp
Macbeth https://web.archive.org/web/20180820035130/http://www.pinkmonkey.com/booknotes/monkeynotes/pmMacbeth02.asp
Troilus and Cressida https://web.archive.org/web/20180820050202/http://www.pinkmonkey.com/booknotes/monkeynotes/pmTroilus02.asp
Henry IV, Part 1 https://web.archive.org/web/20180820040441/http://www.pinkmonkey.com/booknotes/monkeynotes/pmHenry4102.asp
Titus Andronicus https://web.archive.org/web/20180820050202/http://www.pinkmonkey.com/booknotes/monkeynotes/pmTitus02.asp
Julius Caesar https://web.archive.org/web/20180820051427/http://www.pinkmonkey.com/booknotes/monkeynotes/pmJuliusCaesar02.asp
Pygmalion https://web.archive.org/web/20180820033733/http://www.pinkmonkey.com/booknotes/monkeynotes/pmPygmalion02.asp
Arms and the Man https://web.archive.org/web/20180820042551/http://www.pinkmonkey.com/booknotes/monkeynotes/pmArmsMan02.asp
Hedda Gabler https://web.archive.org/web/20180820040441/http://www.pinkmonkey.com/booknotes/monkeynotes/pmHeddaGabler02.asp
An Enemy of the People https://web.archive.org/web/20180820053844/http://www.pinkmonkey.com/booknotes/monkeynotes/pmEnemyPeople02.asp
All's Well That Ends Well https://web.archive.org/web/20180820042551/http://www.pinkmonkey.com/booknotes/monkeynotes/pmAllsWell02.asp
Major Barbara https://web.archive.org/web/20180820035130/http://www.pinkmonkey.com/booknotes/monkeynotes/pmMajorBarbara02.asp
Cyrano De Bergerac https://web.archive.org/web/20180820053313/http://www.pinkmonkey.com/booknotes/monkeynotes/pmCyrano02.asp
King Lear https://web.archive.org/web/20180820035817/http://www.pinkmonkey.com/booknotes/monkeynotes/pmKingLear02.asp
Return of the Native https://web.archive.org/web/20180820044717/http://www.pinkmonkey.com/booknotes/monkeynotes/pmReturnNative02.asp
Notes From Underground https://web.archive.org/web/20180820034941/http://www.pinkmonkey.com/booknotes/monkeynotes/pmNotesFrom02.asp
Under the Greenwood Tree https://web.archive.org/web/20180820032827/http://www.pinkmonkey.com/booknotes/monkeynotes/pmGreenwood02.asp
Of Human Bondage https://web.archive.org/web/20180820052247/http://www.pinkmonkey.com/booknotes/monkeynotes/pmOfHuman02.asp
Ethan Frome https://web.archive.org/web/20180820053844/http://www.pinkmonkey.com/booknotes/monkeynotes/pmEthanFrome02.asp
Timon of Athens https://web.archive.org/web/20180820050202/http://www.pinkmonkey.com/booknotes/monkeynotes/pmTimon02.asp
Much Ado About Nothing https://web.archive.org/web/20180820035130/http://www.pinkmonkey.com/booknotes/monkeynotes/pmMuchado02.asp
Vanity Fair https://web.archive.org/web/20180820033155/http://www.pinkmonkey.com/booknotes/monkeynotes/pmVanity02.asp
================================================
FILE: scripts/data_collection/shmoop/get_summaries.py
================================================
"""
/*
* Copyright (c) 2021, salesforce.com, inc.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
* For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
*/
"""
from builtins import zip, str, range
import pdb, os, csv, re, io, json
import urllib.request, urllib.error, urllib.parse
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup
from tqdm import tqdm
from shutil import rmtree
from nltk.tokenize import word_tokenize, sent_tokenize
import time
from multiprocessing import Pool
# PARAMS
# Root output directory; get_summary creates one sub-directory per book title underneath it.
SUMMARY_DIR = '../../raw_summaries/shmoop/summaries'
# Base URL against which the section hrefs found in the navigation menu are resolved.
MAIN_SITE = 'https://web.archive.org/web/20210225092515/https://www.shmoop.com/'
# Summary list info
# Tab-separated file; get_summary reads column 0 as the title and column 1 as the URL of each row.
summary_list_file = 'literature_links.tsv.pruned'
# Opened in "w" mode, so the log is truncated on every run; receives URLs that failed to load or parse.
errors_file = open("section_errors.txt","w")
def wrap_data(name, summary, analysis, url):
    """Bundle the fields of one scraped summary into the dict that is saved as JSON.

    Returns a dict with the keys ``name``, ``summary``, ``analysis`` and
    ``url`` (in that order), each holding the argument of the same name.
    """
    record = {}
    record["name"] = name
    record["summary"] = summary
    record["analysis"] = analysis
    record["url"] = url
    return record
def get_summary(summary_infos):
    """Scrape the overview and all section summaries of one study guide.

    The ``summary`` page of the guide is downloaded (one retry after
    5 seconds).  The paragraphs of its ``content-wrapper`` div are saved as
    ``overview.txt``, then every link of the ``nav-menu`` div whose href
    contains "summary" is downloaded and saved as ``section_<n>.txt``.  All
    files are JSON dicts produced by ``wrap_data`` with ``analysis`` set to
    None.

    Pages that cannot be loaded or do not have the expected layout are
    printed, logged to ``errors_file`` and skipped.

    Args:
        summary_infos: One row of the literature links file; item 0 is the
            title, item 1 the URL of the study guide.
    """
    print ("summary_infos: ", summary_infos)
    title = summary_infos[0]
    url = summary_infos[1]
    print('\n>>> {} <<<'.format(title))
    # Create a directory for the work if needed
    specific_summary_dir = os.path.join(SUMMARY_DIR, title)
    if not os.path.exists(specific_summary_dir):
        os.makedirs(specific_summary_dir)
    else:
        # Informational only: the directory is reused and files in it are overwritten.
        print("Found existing directory, skipping.")
    # Parse page
    html_address = urllib.parse.urljoin(url + "/", "summary")
    try:
        soup = BeautifulSoup(urllib.request.urlopen(html_address), "html.parser")
    except Exception as e:
        time.sleep(5)
        try:
            soup = BeautifulSoup(urllib.request.urlopen(html_address), "html.parser")
        except Exception as e:
            print (html_address, e)
            errors_file.write(html_address + "\t" + str(e))
            errors_file.write("\n")
            return
    # Parse general summary.  Only the "content-wrapper" lookup is kept: the
    # former SHPlotOverviewSection lookup was overwritten before being used.
    overview_section = soup.find("div", {"class": "content-wrapper"})
    if overview_section is None:
        # find() returns None for an unexpected layout; log it instead of
        # letting an AttributeError abort the whole run.
        reason = "no div with class content-wrapper found"
        print (html_address, reason)
        errors_file.write(html_address + "\t" + reason + "\n")
        return
    overview_summary_paragraphs = [paragraph.text.strip() for paragraph in overview_section.findAll("p")]
    overview_summary = "".join(overview_summary_paragraphs)
    overview_data = wrap_data("Overview", overview_summary, None, str(html_address))
    output_fname = os.path.join(specific_summary_dir, 'overview.txt')
    with open(output_fname, 'w', encoding="utf-8") as f:
        f.write(json.dumps(overview_data))
    # Parse sections summary
    nav_menu = soup.find("div", {"class": "nav-menu"})
    if nav_menu is None:
        reason = "no div with class nav-menu found"
        print (html_address, reason)
        errors_file.write(html_address + "\t" + reason + "\n")
        return
    summary_sections = [
        (link.text, urllib.parse.urljoin(MAIN_SITE, link.get("href")))
        for link in nav_menu.findAll("a", href=True)
        if "summary" in link.get("href")
    ]
    # Go over each section
    for index, (section_title, section_url) in enumerate(summary_sections):
        output_fname = os.path.join(specific_summary_dir, "section_%d.txt" % index)
        print (section_title, section_url)
        # Parse section to get bullet point text
        try:
            section_soup = BeautifulSoup(urllib.request.urlopen(section_url), "html.parser")
        except URLError as err:
            print (err, "Retrying after sleep")
            time.sleep(10)
            try:
                section_soup = BeautifulSoup(urllib.request.urlopen(section_url), "html.parser")
            except Exception as e:
                print (section_url, e)
                errors_file.write(section_url + "\t" + str(e))
                errors_file.write("\n")
                continue
        except Exception as e:
            print (section_url, e)
            errors_file.write(section_url + "\t" + str(e))
            errors_file.write("\n")
            continue
        try:
            section_points = section_soup.find("div", {"data-element": "collapse_target"})
            section_text = "".join([bullet.text.strip() for bullet in section_points.findAll("li")])
            # Try alternate: some sections use paragraphs instead of bullet points
            if section_text == '':
                section_text = "".join([bullet.text.strip() for bullet in section_points.findAll("p")])
            section_data = wrap_data(section_title, section_text, None, str(section_url))
            # Save in a file
            with open(output_fname, 'w', encoding="utf-8") as f:
                f.write(json.dumps(section_data))
            print ("Saved to file")
        except Exception as e:
            # Also reached when the collapse_target div is missing (section_points is None).
            print (section_url, e)
            errors_file.write(section_url + "\t" + str(e))
            errors_file.write("\n")
            continue
# Get contents of the summary file
with open(summary_list_file, 'r') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    summary_infos = list(reader)

# Worker processes started with the "spawn" method re-import this module;
# without the guard each of them would try to start a pool of its own.
if __name__ == "__main__":
    with Pool(1) as p:
        p.map(get_summary, summary_infos)
================================================
FILE: scripts/data_collection/shmoop/get_works.py
================================================
"""
/*
* Copyright (c) 2021, salesforce.com, inc.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
* For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
*/
"""
from builtins import zip, str, range
import pdb, os, csv, re, io
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup
from tqdm import tqdm
from shutil import rmtree
from nltk.tokenize import word_tokenize, sent_tokenize
# PARAMS
# Base URL of the archived study-guide index; category index pages are built underneath it.
MAIN_SITE = 'https://web.archive.org/web/20210225092515/https://www.shmoop.com/study-guides'
# Opened in "w" mode, so the log is truncated on every run; receives index pages that could not be loaded.
errors_file = open("link_errors.txt","w")
def generate_page_links(base_url, category_name, max_pages):
    """Build the URLs of pages 1..max_pages of a category index.

    Args:
        base_url: URL of the study-guide root.
        category_name: Category path segment appended to ``base_url``.
        max_pages: Number of the last page to include; values below 1 give
            an empty list.

    Returns:
        A list of ``<base_url>/<category_name>/index?p=<n>`` strings for
        ``n`` from 1 to ``max_pages`` inclusive.
    """
    # URLs always use "/"; os.path.join would insert "\" on Windows.
    # posixpath.join gives the same result as os.path.join on POSIX systems.
    import posixpath

    page_links = []
    for page_id in range(1, max_pages + 1):
        page_links.append(posixpath.join(base_url, category_name, "index?p=%d" % page_id))
    return page_links
def scrape_index_pages(links):
    """Collect the title and URL of every work listed on the given index pages.

    Args:
        links: iterable of index-page URLs to fetch.

    Returns:
        A list of {"title": ..., "url": ...} dicts in page order. Pages that
        cannot be fetched, and items lacking a title or a link, are logged to
        errors_file and skipped.
    """
    scraped_links = []
    for k, page_url in enumerate(links):
        print('>>> {}. {} <<<'.format(k, page_url))
        try:
            soup = BeautifulSoup(urllib.request.urlopen(page_url), "html.parser")
        except Exception as e:
            print ("Skipping: ", page_url)
            errors_file.write(page_url + "\t" + str(e) + "\n")
            continue
        items = soup.findAll("div", {"class" : "item"})
        print("Found %d items." % len(items))
        # Go over each listed work
        for item in items:
            title_node = item.find("div", {"class": "item-info"})
            link_node = item.find("a", {"class": "details"})
            item_url = link_node.get("href") if link_node is not None else None
            # Results are only returned at the very end, so one malformed item
            # must not abort the whole scrape; log it and move on.
            if title_node is None or item_url is None:
                print ("Skipping malformed item on: ", page_url)
                errors_file.write(page_url + "\t" + "item without title or link" + "\n")
                continue
            scraped_links.append({
                "title": title_node.text.strip(),
                "url": item_url.strip()
            })
    return scraped_links
# generate literature links
# Scrape index pages 1..95 of the "literature" category, then dump every
# (title, url) pair as one tab-separated row.
works_list = generate_page_links(MAIN_SITE, "literature", 95)
scraped_data = scrape_index_pages(works_list)
with open("literature_links.tsv", "w") as fd:
    fd.writelines(
        "%s\t%s\n" % (entry["title"], entry["url"]) for entry in scraped_data
    )
================================================
FILE: scripts/data_collection/shmoop/literature_links.tsv.pruned
================================================
Romeo and Juliet https://web.archive.org/web/20210624200133/https://www.shmoop.com/study-guides/literature/romeo-and-juliet
Julius Caesar https://web.archive.org/web/20210624193711/https://www.shmoop.com/study-guides/literature/julius-caesar
Ivanhoe https://web.archive.org/web/20210624193654/https://www.shmoop.com/study-guides/literature/ivanhoe
The Scarlet Letter https://web.archive.org/web/20210624201606/https://www.shmoop.com/study-guides/literature/scarlet-letter
The Importance of Being Earnest https://web.archive.org/web/20210624201341/https://www.shmoop.com/study-guides/literature/importance-of-being-earnest
Howards End https://web.archive.org/web/20210624193615/https://www.shmoop.com/study-guides/literature/howards-end
White Fang https://web.archive.org/web/20200929150717/https://www.shmoop.com/study-guides/literature/white-fang
Coriolanus https://web.archive.org/web/20200928191345/https://www.shmoop.com/study-guides/literature/coriolanus
The Merry Wives of Windsor https://web.archive.org/web/20210624201500/https://www.shmoop.com/study-guides/literature/merry-wives-of-windsor
A Doll's House https://web.archive.org/web/20210304225252/https://www.shmoop.com/study-guides/literature/dolls-house
Sons and Lovers https://web.archive.org/web/20210624200850/https://www.shmoop.com/study-guides/literature/sons-and-lovers
Othello https://web.archive.org/web/20210624194547/https://www.shmoop.com/study-guides/literature/othello
Titus Andronicus https://web.archive.org/web/20210624201826/https://www.shmoop.com/study-guides/literature/titus-andronicus-shakespeare
Love's Labour's Lost https://web.archive.org/web/20210624193742/https://www.shmoop.com/study-guides/literature/loves-labours-lost
Emma https://web.archive.org/web/20201127211107/https://www.shmoop.com/study-guides/literature/emma
An Enquiry Concerning the Principles of Morals https://web.archive.org/web/20200928194100/https://www.shmoop.com/study-guides/literature/enquiry-concerning-the-principles-of-morals
Kidnapped https://web.archive.org/web/20210624193722/https://www.shmoop.com/study-guides/literature/kidnapped
The Goose Girl https://web.archive.org/web/20210624201228/https://www.shmoop.com/study-guides/literature/the-goose-girl
The Wonderful Wizard of Oz https://web.archive.org/web/20210624201753/https://www.shmoop.com/study-guides/literature/wonderful-wizard-of-oz-book
The Spanish Tragedy https://web.archive.org/web/20210624201644/https://www.shmoop.com/study-guides/literature/the-spanish-tragedy
The Mysteries of Udolpho https://web.archive.org/web/20210624201500/https://www.shmoop.com/study-guides/literature/mysteries-of-udolpho
Adam Bede https://web.archive.org/web/20201125114409/https://www.shmoop.com/study-guides/literature/adam-bede
Tess of the D'Urbervilles https://web.archive.org/web/20210624200938/https://www.shmoop.com/study-guides/literature/tess-of-the-durbervilles
The Prince https://web.archive.org/web/20210624201604/https://www.shmoop.com/study-guides/literature/prince-machiavelli
Sense and Sensibility https://web.archive.org/web/20210624200213/https://www.shmoop.com/study-guides/literature/sense-and-sensibility
Far From the Madding Crowd https://web.archive.org/web/20210624192652/https://www.shmoop.com/study-guides/literature/far-from-the-madding-crowd
The Brothers Karamazov https://web.archive.org/web/20210624201022/https://www.shmoop.com/study-guides/literature/brothers-karamazov
The Picture of Dorian Gray https://web.archive.org/web/20210624201544/https://www.shmoop.com/study-guides/literature/picture-dorian-gray
Lord Jim https://web.archive.org/web/20210624193742/https://www.shmoop.com/study-guides/literature/lord-jim
Twelfth Night, or What You Will https://web.archive.org/web/20210624202234/https://www.shmoop.com/study-guides/literature/twelfth-night
The Red and the Black https://web.archive.org/web/20210624201556/https://www.shmoop.com/study-guides/literature/red-and-the-black
The Comedy of Errors https://web.archive.org/web/20210624201036/https://www.shmoop.com/study-guides/literature/comedy-of-errors
The White Devil https://web.archive.org/web/20210624201726/https://www.shmoop.com/study-guides/literature/the-white-devil
The Tempest https://web.archive.org/web/20210624201650/https://www.shmoop.com/study-guides/literature/tempest
The Tragedy of Antony and Cleopatra https://web.archive.org/web/20210624201736/https://www.shmoop.com/study-guides/literature/antony-cleopatra
Uncle Vanya https://web.archive.org/web/20210624202234/https://www.shmoop.com/study-guides/literature/uncle-vanya
The Portrait of a Lady https://web.archive.org/web/20210624201604/https://www.shmoop.com/study-guides/literature/portrait-of-a-lady
Dracula https://web.archive.org/web/20201127213435/https://www.shmoop.com/study-guides/literature/dracula
Jane Eyre https://web.archive.org/web/20210624193654/https://www.shmoop.com/study-guides/literature/jane-eyre
The Hound of the Baskervilles https://web.archive.org/web/20210624201316/https://www.shmoop.com/study-guides/literature/hound-of-the-baskervilles
Madame Bovary https://web.archive.org/web/20210624193757/https://www.shmoop.com/study-guides/literature/madame-bovary
The Aeneid https://web.archive.org/web/20210624201011/https://www.shmoop.com/study-guides/literature/aeneid
Main Street https://web.archive.org/web/20210624193757/https://www.shmoop.com/study-guides/literature/main-street
Oliver Twist https://web.archive.org/web/20210624193914/https://www.shmoop.com/study-guides/literature/oliver-twist
The House of the Seven Gables https://web.archive.org/web/20210624201316/https://www.shmoop.com/study-guides/literature/house-seven-gables
Vanity Fair https://web.archive.org/web/20200929144928/https://www.shmoop.com/study-guides/literature/vanity-fair-thackeray
Little Women https://web.archive.org/web/20210624193802/https://www.shmoop.com/study-guides/literature/little-women
Babbitt https://web.archive.org/web/20210116190210/https://www.shmoop.com/study-guides/literature/babbitt
A Tale of Two Cities https://web.archive.org/web/20201205220634/https://www.shmoop.com/study-guides/literature/tale-of-two-cities
The House of Mirth https://web.archive.org/web/20210624201316/https://www.shmoop.com/study-guides/literature/house-of-mirth
Persuasion https://web.archive.org/web/20210624194445/https://www.shmoop.com/study-guides/literature/persuasion
Gulliver's Travels https://web.archive.org/web/20210624193541/https://www.shmoop.com/study-guides/literature/gullivers-travels
Wuthering Heights https://web.archive.org/web/20200919013354/https://www.shmoop.com/study-guides/literature/wuthering-heights
Frankenstein https://web.archive.org/web/20210624192730/https://www.shmoop.com/study-guides/literature/frankenstein
Mansfield Park https://web.archive.org/web/20210624193757/https://www.shmoop.com/study-guides/literature/mansfield-park
Middlemarch https://web.archive.org/web/20210624193915/https://www.shmoop.com/study-guides/literature/middlemarch
Siddhartha https://web.archive.org/web/20210624200823/https://www.shmoop.com/study-guides/literature/siddhartha
Treasure Island https://web.archive.org/web/20210624201815/https://www.shmoop.com/study-guides/literature/treasure-island-book
David Copperfield https://web.archive.org/web/20201021164515/https://www.shmoop.com/study-guides/literature/david-copperfield
Sister Carrie https://web.archive.org/web/20210624200857/https://www.shmoop.com/study-guides/literature/sister-carrie
The Turn of the Screw https://web.archive.org/web/20210624201736/https://www.shmoop.com/study-guides/literature/turn-of-the-screw
Candide https://web.archive.org/web/20201026164007/https://www.shmoop.com/study-guides/literature/candide
Paradise Lost https://web.archive.org/web/20210624193940/https://www.shmoop.com/study-guides/literature/paradise-lost
Northanger Abbey https://web.archive.org/web/20210624193925/https://www.shmoop.com/study-guides/literature/northanger-abbey
My Antonia https://web.archive.org/web/20210624193939/https://www.shmoop.com/study-guides/literature/my-antonia
A Vindication of the Rights of Woman https://web.archive.org/web/20201205220634/https://www.shmoop.com/study-guides/literature/a-vindication-of-the-rights-of-woman
The Life of Timon of Athens https://web.archive.org/web/20210624201506/https://www.shmoop.com/study-guides/literature/timon-of-athens
Hamlet https://web.archive.org/web/20210624193541/https://www.shmoop.com/study-guides/literature/hamlet
A Midsummer Night's Dream https://web.archive.org/web/20210304230026/https://www.shmoop.com/study-guides/literature/midsummer-nights-dream
A Room with a View https://web.archive.org/web/20210304213804/https://www.shmoop.com/study-guides/literature/room-with-a-view
Around the World in Eighty Days https://web.archive.org/web/20210116190210/https://www.shmoop.com/study-guides/literature/around-the-world-in-eighty-days
What Maisie Knew https://web.archive.org/web/20210624202247/https://www.shmoop.com/study-guides/literature/what-maisie-knew
Where Angels Fear to Tread https://web.archive.org/web/20200929150717/https://www.shmoop.com/study-guides/literature/where-angels-fear-to-tread
Heart of Darkness https://web.archive.org/web/20210624193540/https://www.shmoop.com/study-guides/literature/heart-of-darkness
Leviathan https://web.archive.org/web/20210624193802/https://www.shmoop.com/study-guides/literature/leviathan
From the Earth to the Moon https://web.archive.org/web/20210624192803/https://www.shmoop.com/study-guides/literature/from-the-earth-to-the-moon
The Power and the Glory https://web.archive.org/web/20210624201604/https://www.shmoop.com/study-guides/literature/the-power-and-the-glory
Notes from the Underground https://web.archive.org/web/20210624193925/https://www.shmoop.com/study-guides/literature/notes-from-underground
The Return of the Native https://web.archive.org/web/20210624201543/https://www.shmoop.com/study-guides/literature/return-of-native
The Secret Agent https://web.archive.org/web/20210624201606/https://www.shmoop.com/study-guides/literature/the-secret-agent
The Confidence-Man https://web.archive.org/web/20210624201036/https://www.shmoop.com/study-guides/literature/confidence-man
Typee https://web.archive.org/web/20210624202234/https://www.shmoop.com/study-guides/literature/typee
Anne of Green Gables https://web.archive.org/web/20200928194643/https://www.shmoop.com/study-guides/literature/anne-of-green-gables
A Little Princess https://web.archive.org/web/20210304230026/https://www.shmoop.com/study-guides/literature/little-princess
Maggie: A Girl of the Streets https://web.archive.org/web/20210624193757/https://www.shmoop.com/study-guides/literature/maggie-a-girl-of-the-streets
The Monkey's Paw https://web.archive.org/web/20210624201500/https://www.shmoop.com/study-guides/literature/monkeys-paw
The Red Badge of Courage https://web.archive.org/web/20210624201556/https://www.shmoop.com/study-guides/literature/red-badge-of-courage
O Pioneers! https://web.archive.org/web/20210624193914/https://www.shmoop.com/study-guides/literature/o-pioneers
The Mill on the Floss https://web.archive.org/web/20210624201500/https://www.shmoop.com/study-guides/literature/mill-on-the-floss
Henry VIII https://web.archive.org/web/20210624193614/https://www.shmoop.com/study-guides/literature/henry-viii
The Time Machine https://web.archive.org/web/20210624201736/https://www.shmoop.com/study-guides/literature/time-machine-hg-wells
Kim https://web.archive.org/web/20210624193722/https://www.shmoop.com/study-guides/literature/kim-rudyard-kipling
The Invisible Man https://web.archive.org/web/20210624201403/https://www.shmoop.com/study-guides/literature/invisible-man-wells
Little Dorrit https://web.archive.org/web/20210624193802/https://www.shmoop.com/study-guides/literature/little-dorrit
Incidents in the Life of a Slave Girl https://web.archive.org/web/20210624193640/https://www.shmoop.com/study-guides/literature/incidents-life-slave-girl
Henry IV Part 2 https://web.archive.org/web/20210624193540/https://www.shmoop.com/study-guides/literature/henry-iv-part-2
Macbeth https://web.archive.org/web/20210624193757/https://www.shmoop.com/study-guides/literature/macbeth
Troilus and Cressida https://web.archive.org/web/20210624201815/https://www.shmoop.com/study-guides/literature/troilus-cressida
Richard II https://web.archive.org/web/20210624200133/https://www.shmoop.com/study-guides/literature/richard-ii
Henry IV Part 1 https://web.archive.org/web/20210624193540/https://www.shmoop.com/study-guides/literature/henry-iv-part-1
King John https://web.archive.org/web/20210624193722/https://www.shmoop.com/study-guides/literature/king-john
Henry VI Part 1 https://web.archive.org/web/20210624193540/https://www.shmoop.com/study-guides/literature/henry-vi-part-1
Richard III https://web.archive.org/web/20210624200133/https://www.shmoop.com/study-guides/literature/richard-iii
Henry V https://web.archive.org/web/20210624193540/https://www.shmoop.com/study-guides/literature/henry-v
The Two Gentlemen of Verona https://web.archive.org/web/20210624201802/https://www.shmoop.com/study-guides/literature/gentlemen-of-verona
Pygmalion https://web.archive.org/web/20210624200145/https://www.shmoop.com/study-guides/literature/pygmalion
The Boxcar Children https://web.archive.org/web/20210624201022/https://www.shmoop.com/study-guides/literature/boxcar-children
The Piazza Tales https://web.archive.org/web/20210624201544/https://www.shmoop.com/study-guides/literature/the-piazza-tales
The Return of Sherlock Holmes https://web.archive.org/web/20210624201543/https://www.shmoop.com/study-guides/literature/return-of-sherlock-holmes
Ghosts https://web.archive.org/web/20210624192803/https://www.shmoop.com/study-guides/literature/ghosts-ibsen
Hedda Gabler https://web.archive.org/web/20210624193540/https://www.shmoop.com/study-guides/literature/hedda-gabler
Daisy Miller https://web.archive.org/web/20210624192036/https://www.shmoop.com/study-guides/literature/daisy-miller
Tartuffe https://web.archive.org/web/20210624200938/https://www.shmoop.com/study-guides/literature/tartuffe
An Enemy of the People https://web.archive.org/web/20200928194100/https://www.shmoop.com/study-guides/literature/enemy-of-the-people
Measure for Measure https://web.archive.org/web/20210624193815/https://www.shmoop.com/study-guides/literature/measure-for-measure
The Seagull https://web.archive.org/web/20210624201606/https://www.shmoop.com/study-guides/literature/the-seagull
Idylls of the King https://web.archive.org/web/20210624193611/https://www.shmoop.com/study-guides/literature/idylls-of-the-king
The Merchant of Venice https://web.archive.org/web/20210624201500/https://www.shmoop.com/study-guides/literature/merchant-of-venice
Meditations https://web.archive.org/web/20210624193815/https://www.shmoop.com/study-guides/literature/meditations
The Man in the Iron Mask https://web.archive.org/web/20210624201428/https://www.shmoop.com/study-guides/literature/man-in-the-iron-mask
All's Well That Ends Well https://web.archive.org/web/20201125115619/https://www.shmoop.com/study-guides/literature/alls-well-that-ends-well
Major Barbara https://web.archive.org/web/20210624193757/https://www.shmoop.com/study-guides/literature/major-barbara
The Communist Manifesto https://web.archive.org/web/20210624201036/https://www.shmoop.com/study-guides/literature/communist-manifesto
An Ideal Husband https://web.archive.org/web/20200928194100/https://www.shmoop.com/study-guides/literature/an-ideal-husband
The Three Musketeers https://web.archive.org/web/20210624201736/https://www.shmoop.com/study-guides/literature/three-musketeers
Black Beauty https://web.archive.org/web/20201125120012/https://www.shmoop.com/study-guides/literature/black-beauty
King Lear https://web.archive.org/web/20210624193722/https://www.shmoop.com/study-guides/literature/king-lear
Bleak House https://web.archive.org/web/20201026155119/https://www.shmoop.com/study-guides/literature/bleak-house
Anthem https://web.archive.org/web/20200928194643/https://www.shmoop.com/study-guides/literature/anthem-ayn-rand
Ethan Frome https://web.archive.org/web/20210624193104/https://www.shmoop.com/study-guides/literature/ethan-frome
The Jungle https://web.archive.org/web/20210624201403/https://www.shmoop.com/study-guides/literature/the-jungle
The Ambassadors https://web.archive.org/web/20210624201020/https://www.shmoop.com/study-guides/literature/the-ambassadors
The Age of Innocence https://web.archive.org/web/20210624201011/https://www.shmoop.com/study-guides/literature/the-age-of-innocence
Much Ado About Nothing https://web.archive.org/web/20210624193920/https://www.shmoop.com/study-guides/literature/much-ado-about-nothing
The Secret Garden https://web.archive.org/web/20210624201606/https://www.shmoop.com/study-guides/literature/the-secret-garden
The Duchess of Malfi https://web.archive.org/web/20210624201210/https://www.shmoop.com/study-guides/literature/the-duchess-of-malfi
The Taming of the Shrew https://web.archive.org/web/20210624201650/https://www.shmoop.com/study-guides/literature/taming-of-the-shrew
As You Like It https://web.archive.org/web/20210116190210/https://www.shmoop.com/study-guides/literature/as-you-like-it
================================================
FILE: scripts/data_collection/sparknotes/get_summaries.py
================================================
"""
/*
* Copyright (c) 2021, salesforce.com, inc.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
* For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
*/
"""
from builtins import zip, str, range
import pdb, os, csv, re, io, json
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup
from tqdm import tqdm
from shutil import rmtree
from nltk.tokenize import word_tokenize, sent_tokenize
import time
from urllib.error import HTTPError, URLError
from multiprocessing import Pool
# PARAMS
# Root directory under which get_summary creates one sub-directory per work.
SUMMARY_DIR = '../../raw_summaries/sparknotes/summaries'
# Summary list info: tab-separated file read row by row with csv.reader;
# get_summary uses column 0 as the title and column 1 as the page URL.
summary_list_file = "literature_links.tsv.pruned"
# Log of pages that could not be fetched or parsed; mode "w" truncates it on every run.
f_errors = open("section_errors.txt","w")
def wrap_data(name, summary, analysis, url):
    """Bundle the pieces of one scraped section into a single record.

    Returns a dict with the keys "name", "summary", "analysis" and "url"
    (in that order) holding the corresponding arguments unchanged.
    """
    record = dict(name=name, summary=summary, analysis=analysis, url=url)
    return record
# Get contents of the summary file
# Every tab-separated row is kept in memory as a list of its column values.
with open(summary_list_file, mode='r') as tsvfile:
    summary_infos = [row for row in csv.reader(tsvfile, delimiter='\t')]
# For each summary info
def get_summary(summary_info):
    """Download the overview and all section summaries of one work.

    Args:
        summary_info: one row of the works list; element 0 is the title and
            element 1 is the URL of the work's main page.

    Side effects:
        Creates SUMMARY_DIR/<title>/ if needed and writes "overview.txt" and
        "section_<index>.txt" files there, each holding one JSON record built
        by wrap_data. Fetch and parse failures are appended to f_errors.

    Returns:
        None. Returns early if the main page cannot be fetched or parsed.
    """
    title = summary_info[0]
    page_url = summary_info[1]
    print('\n>>> {} <<<'.format(title))
    # Create a directory for the work if needed
    specific_summary_dir = os.path.join(SUMMARY_DIR, title)
    if not os.path.exists(specific_summary_dir):
        os.makedirs(specific_summary_dir)
    else:
        # NOTE(review): despite the message nothing is skipped; processing
        # continues and existing files are overwritten (opened with 'w').
        print("Found existing directory, skipping.")
    # Parse page
    overview_url = urllib.parse.urljoin(page_url, "summary")
    # Stays None if both attempts fail, so the overview step can be skipped
    # instead of crashing on an unbound name.
    soup = None
    try:
        soup = BeautifulSoup(urllib.request.urlopen(overview_url), "html.parser")
    except Exception as e:
        # One retry after a short pause
        time.sleep(5)
        try:
            soup = BeautifulSoup(urllib.request.urlopen(overview_url), "html.parser")
        except Exception as e:
            print ("Overview error: " , overview_url, e)
            f_errors.write(title + "\t" + overview_url + "\t" + str(e) + "\n")
    # Parse general summary
    overview_data = soup.find("div", {"id": "plotoverview"}) if soup is not None else None
    if overview_data:
        overview_summary_paragraphs = [paragraph.text.strip().replace("\n", " ") for paragraph in overview_data.findAll("p")]
        overview_summary = "\n".join(overview_summary_paragraphs)
        overview_data = wrap_data("Overview", overview_summary, None, str(overview_url))
        output_fname = os.path.join(specific_summary_dir, 'overview.txt')
        with open(output_fname, 'w', encoding="utf-8") as f:
            f.write(json.dumps(overview_data))
    else:
        print ("No Overview text found")
    # Parse sections summary
    print ("page_url: ", page_url)
    try:
        soup = BeautifulSoup(urllib.request.urlopen(page_url), "html.parser")
        summary_section = soup.find("span", {"id": "Summary"}).find_parent("div")
    except Exception as e:
        # One retry after a short pause; give up on this work if it fails again
        time.sleep(5)
        try:
            soup = BeautifulSoup(urllib.request.urlopen(page_url), "html.parser")
            summary_section = soup.find("span", {"id": "Summary"}).find_parent("div")
        except Exception as e:
            print (page_url, e)
            f_errors.write(title + "\t" + page_url + "\t" + str(e) + "\n")
            return
    # Keep only links whose href mentions "section"; anchors without an href
    # are ignored rather than raising TypeError on `"section" in None`.
    summary_links = [
        href
        for href in (link.get("href") for link in summary_section.findAll("a"))
        if href and "section" in href
    ]
    for index, section in enumerate(summary_links):
        print('Parsing section: {}'.format(index))
        output_fname = os.path.join(specific_summary_dir, "section_%d.txt" % index)
        section_url = urllib.parse.urljoin(page_url, section)
        print ("section_url: ", section_url)
        try:
            soup = BeautifulSoup(urllib.request.urlopen(section_url), "html.parser")
        except URLError as err:
            print (err, "Retrying after sleep")
            time.sleep(10)
            try:
                soup = BeautifulSoup(urllib.request.urlopen(section_url), "html.parser")
            except Exception as e:
                print (section_url, e)
                f_errors.write(section_url + "\t" + str(e))
                f_errors.write("\n")
                continue
        except Exception as e:
            print ("Here:", section_url, e)
            f_errors.write(str(index) + "\t" + section_url + "\t" + str(e) + "\n")
            continue
        # The number of pages of this section is taken from the number of
        # anchors in the navigation block (at least one page is assumed).
        subsection_links = soup.find("div", {"class": "interior-sticky-nav"})
        if subsection_links is None:
            subsection_links = soup.find("div", {"class": "pagination-links"})
        # print ("subsection_links before: ", subsection_links)
        num_subsections = max(1, len(subsection_links.findAll("a") if subsection_links else []))
        subsection_links = ["page/%d/" % page_ix for page_ix in range(1, num_subsections+1)]
        # Header = page title minus the site suffix and minus everything up to the first ":"
        section_header = " ".join([x.strip() for x in soup.title.string.replace(" | SparkNotes", "").split(":")[1:]])
        section_paragraphs = []
        for subsection_link in subsection_links:
            # The first page is the section URL itself
            if subsection_link == 'page/1/':
                subsection_link = ""
            subsection_url = urllib.parse.urljoin(section_url, subsection_link)
            print ("section_header and subsection_url: ", section_header, subsection_url)
            try:
                soup = BeautifulSoup(urllib.request.urlopen(subsection_url), "html.parser")
            except URLError as err:
                print (err, "Retrying after sleep")
                time.sleep(10)
                try:
                    soup = BeautifulSoup(urllib.request.urlopen(subsection_url), "html.parser")
                except Exception as e:
                    print (subsection_url, e)
                    f_errors.write(subsection_url + "\t" + str(e))
                    f_errors.write("\n")
                    continue
            except Exception as e:
                print ("subsection error: ", subsection_url, e)
                f_errors.write(str(index) + "\t" + section_header + "\t" + section_url + "\t" + subsection_url + "\t" + str(e) + "\n")
                continue
            subsection_data = soup.find("div", {"id": "section"})
            if subsection_data is None:
                # Try alternate way
                subsection_data = soup.find("div", {"class": "mainTextContent"})
            if subsection_data is None:
                print ("subsection_data is None")
                f_errors.write(str(index) + "\t" + section_header + "\t" + section_url + "\t" + subsection_url + "\t" + "No Data" + "\n")
                continue
            section_paragraphs.append(subsection_data.text.strip().replace("\n", " "))
        # Nothing could be scraped for this section: write no file
        if section_paragraphs == []:
            continue
        section_text = "".join(section_paragraphs)
        if "Summary:" in section_text and "Analysis:" in section_text:
            # Pieces produced by splitting on "Analysis:" are sorted into
            # summary or analysis depending on whether they contain "Summary:".
            section_text_split = section_text.split("Analysis:")
            summary_text = " ".join([summary for summary in section_text_split if "Summary:" in summary]).replace("Summary:", "").strip()
            analysis_text = " ".join([analysis for analysis in section_text_split if "Summary:" not in analysis]).replace("Analysis:", "").strip()
        else:
            summary_text = section_text.replace("Summary:", "").strip()
            analysis_text = None
        section_data = wrap_data(section_header, summary_text, analysis_text, section_url)
        print ("Saving to file: ", output_fname)
        with open(output_fname, 'w', encoding="utf-8") as f:
            f.write(json.dumps(section_data))
# Only start the worker pool when run as a script: under the "spawn" start
# method the child re-imports this module, and an unguarded Pool at import
# time would make every child try to start its own pool.
if __name__ == "__main__":
    with Pool(1) as p:
        p.map(get_summary, summary_infos)
================================================
FILE: scripts/data_collection/sparknotes/get_works.py
================================================
"""
/*
* Copyright (c) 2021, salesforce.com, inc.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
* For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
*/
"""
from builtins import zip, str, range
import pdb, os, csv, re, io
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup
from tqdm import tqdm
from shutil import rmtree
from nltk.tokenize import word_tokenize, sent_tokenize
# PARAMS
# Base URL (a web.archive.org snapshot) that scraped hrefs are resolved against via urljoin.
MAIN_SITE = 'https://web.archive.org/web/20210223175142/https://www.sparknotes.com/'
# Index page that is scraped for the list of works.
SEED_URL = 'https://web.archive.org/web/20210223175142/https://www.sparknotes.com/lit'
# Log for a seed page that could not be fetched; mode "w" truncates it on every run.
errors_file = open("link_errors.txt","w")
def scrape_index_pages(seed_page):
    """Collect the title and absolute URL of every work listed on the seed page.

    Args:
        seed_page: URL of the index page to fetch.

    Returns:
        A list of {"title": ..., "url": ...} dicts in page order; commas are
        removed from titles and URLs are resolved against MAIN_SITE. Returns
        [] if the page cannot be fetched (the error is logged to errors_file).
        Items without a link are logged and skipped.
    """
    scraped_links = []
    try:
        soup = BeautifulSoup(urllib.request.urlopen(seed_page), "html.parser")
    except Exception as e:
        print ("Skipping: ", seed_page)
        errors_file.write(seed_page + "\t" + str(e) + "\n")
        return []
    items = soup.findAll("li", {"class": "hub-AZ-list__card hub-AZ-list__card--byTitle"})
    print("Found %d items." % len(items))
    # Go over each listed work
    for item in items:
        # Take title and URL from the SAME anchor (the first one carrying an
        # href), so they cannot come from two different links and the href is
        # never None.
        anchor = item.find("a", href=True)
        if anchor is None:
            print ("Skipping item without link on: ", seed_page)
            errors_file.write(seed_page + "\t" + "item without link" + "\n")
            continue
        scraped_links.append({
            "title": anchor.text.strip().replace(",",""),
            "url": urllib.parse.urljoin(MAIN_SITE, anchor.get("href").strip())
        })
    return scraped_links
# generate literature links
# Scrape the seed index page, then dump every (title, url) pair as one
# tab-separated row.
scraped_data = scrape_index_pages(SEED_URL)
with open("literature_links.tsv", "w") as fd:
    fd.writelines(
        "%s\t%s\n" % (entry["title"], entry["url"]) for entry in scraped_data
    )
================================================
FILE: scripts/data_collection/sparknotes/literature_links.tsv.pruned
================================================
The House of the Seven Gables https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/sevengables/
The Scarlet Letter https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/scarlet/
A Tale of Two Cities https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/a-tale-of-two-cities/
The Brothers Karamazov https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/brothersk/
Far from the Madding Crowd https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/maddingcrowd/
Frankenstein https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/frankenstein/
The Secret Garden https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/secretgarden/
The Taming of the Shrew https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/shrew/
Antony and Cleopatra https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/antony/
The Prince https://web.archive.org/web/20210223161114/https://www.sparknotes.com/philosophy/prince/
As You Like It https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/asyoulikeit/
Sense and Sensibility https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/sensibility/
The Picture of Dorian Gray https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/doriangray/
Lord Jim https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/lordjim/
The Red and the Black https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/redblack/
The Comedy of Errors https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/errors/
The Tempest https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/tempest/
Uncle Vanya https://web.archive.org/web/20210223161114/https://www.sparknotes.com/drama/unclevanya/
The Portrait of a Lady https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/portraitlady/
The Last of the Mohicans https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/mohicans/
Dracula https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/dracula/
Jane Eyre https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/janeeyre/
Adam Bede https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/adambede/
The Aeneid https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/aeneid/
Main Street https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/mainstreet/
Oliver Twist https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/oliver/
Little Women https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/littlewomen/
Babbitt https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/babbitt/
Sons and Lovers https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/sonsandlovers/
The House of Mirth https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/mirth/
Emma https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/emma/
Persuasion https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/persuasion/
Wuthering Heights https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/wuthering/
Mansfield Park https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/mansfieldpark/
Middlemarch https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/middlemarch/
Ivanhoe https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/ivanhoe/
David Copperfield https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/copperfield/
Sister Carrie https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/sistercarrie/
Kidnapped https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/kidnapped/
Dr. Jekyll and Mr. Hyde https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/jekyll/
The Turn of the Screw https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/screw/
Candide https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/candide/
Paradise Lost https://web.archive.org/web/20210223161114/https://www.sparknotes.com/poetry/paradiselost/
Northanger Abbey https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/northangerabbey/
My Antonia https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/antonia/
Hamlet https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/hamlet/
Othello https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/othello/
A Room with a View https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/room/
Coriolanus https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/coriolanus/
Looking Backward https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/lookingbackward/
Heart of Darkness https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/heart-of-darkness/
The Power and the Glory https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/powerglory/
The Return of the Native https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/returnofnative/
Howards End https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/howardsend/
Typee https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/typee/
Anne of Green Gables https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/anneofgreengables/
Regeneration https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/regeneration/
Maggie: A Girl of the Streets https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/maggie/
The Good Soldier https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/goodsoldier/
The Red Badge of Courage https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/redbadge/
The New Testament https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/newtestament/
O Pioneers! https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/opioneers/
Henry VIII https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/henryviii/
The Time Machine https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/timemachine/
Henry IV Part 2 https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/henry4pt2/
Macbeth https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/macbeth/
Troilus and Cressida https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/troilus/
Richard II https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/richardii/
Henry IV Part 1 https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/henry4pt1/
King John https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/kingjohn/
Henry VI Part 1 https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/henry6pt1/
Richard III https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/richardiii/
Titus Andronicus https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/titus/
Julius Caesar https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/juliuscaesar/
Love's Labours Lost https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/labours/
The Two Gentlemen of Verona https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/twogentlemen/
Pygmalion https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/pygmalion/
Twelfth Night https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/twelfthnight/
The Merry Wives of Windsor https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/merrywives/
An Enemy of the People https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/enemyofthepeople/
Measure for Measure https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/measure/
Alice in Wonderland https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/alice/
White Fang https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/fang/
Romeo and Juliet https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/romeojuliet/
The Merchant of Venice https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/merchant/
All's Well That Ends Well https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/allswell/
Major Barbara https://web.archive.org/web/20210223161114/https://www.sparknotes.com/drama/majorbarbara/
The Jew of Malta https://web.archive.org/web/20210223161114/https://www.sparknotes.com/drama/jewofmalta/
An Ideal Husband https://web.archive.org/web/20210223161114/https://www.sparknotes.com/drama/idealhusband/
The Three Musketeers https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/3musk/
Cyrano de Bergerac https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/cyrano/
The Winter's Tale https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/winterstale/
King Lear https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/lear/
Bleak House https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/bleakhouse/
The Adventures of Huckleberry Finn https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/huckfinn/
Anthem https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/anthem/
Ethan Frome https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/frome/
The Jungle https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/jungle/
The Age of Innocence https://web.archive.org/web/20210223161114/https://www.sparknotes.com/lit/ageofinnocence/
Timon of Athens https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/timonofathens/
Much Ado About Nothing https://web.archive.org/web/20210223161114/https://www.sparknotes.com/shakespeare/muchado/
================================================
FILE: scripts/data_collection/thebestnotes/get_summaries.py
================================================
"""
/*
* Copyright (c) 2021, salesforce.com, inc.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
* For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
*/
"""
from builtins import zip, str, range
import pdb, os, csv, re, io, json
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup
from tqdm import tqdm
from shutil import rmtree
from nltk.tokenize import word_tokenize, sent_tokenize
from unidecode import unidecode
import argparse, string
import time
from urllib.error import HTTPError, URLError
# PARAMS
# Root output directory; the main loop creates one sub-directory per work under it.
SUMMARY_DIR = '../../raw_summaries/thebestnotes/summaries'
MAIN_SITE = 'https://web.archive.org/web/20210111015641/http://thebestnotes.com/'

# Summary list info: tab-separated rows, unpacked later as (title, page_url)
summary_list_file = 'literature_links.tsv.pruned'

# Create a fresh file for the links that throw HTTP errors, so that we can try access them again.
# The handle stays open for the whole run because the fetch helpers write to it.
f_errors = open("section_errors.txt","w")

# Get contents of the summary file.
# newline='' is what the csv module asks for when it is handed a file object.
with open(summary_list_file, 'r', newline='') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    # csv.reader yields [] for blank lines; drop them so a stray empty line
    # cannot break the (title, page_url) unpacking in the main loop.
    summary_infos = [row for row in reader if row]
def unify_title(title):
    """Return *title* lower-cased, stripped of surrounding whitespace and with
    every ASCII punctuation character removed.

    Whitespace is stripped before punctuation is removed, so whitespace that
    sat next to leading/trailing punctuation is kept.
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return title.lower().strip().translate(drop_punctuation)
def get_overview_paragraphs(overview):
    """Download a synopsis page and return the text that follows its synopsis heading.

    Args:
        overview: URL of the page to fetch.

    Returns:
        A list of paragraph strings (passed through unidecode). The list is
        empty when the page cannot be fetched or no synopsis text is found;
        fetch failures are also recorded in the module-level f_errors file.
    """
    # Repair URLs whose scheme separator carries a single slash.
    if 'https:/the' in overview:
        overview = overview.replace('https:/the', 'https://the')
    if 'http:/the' in overview:
        overview = overview.replace('http:/the', 'http://the')

    try:
        # The context manager closes the HTTP response once it has been parsed.
        with urllib.request.urlopen(overview) as response:
            soup = BeautifulSoup(response, "html.parser")
    except URLError as err:
        print(err, overview, "Retrying after sleep")
        time.sleep(10)
        try:
            with urllib.request.urlopen(overview) as response:
                soup = BeautifulSoup(response, "html.parser")
        except Exception as e:
            print(overview, e)
            f_errors.write(overview + "\t" + str(e))
            f_errors.write("\n")
            # Always hand back a list: the caller joins the result.
            return []
    except Exception as e:
        print(overview, e)
        # Terminate the record so the next error starts on its own line.
        f_errors.write(overview + "\t" + str(e))
        f_errors.write("\n")
        return []

    overview_paragraphs = []
    flag = 0  # 1 while we are inside the synopsis block
    for paragraph in soup.findAll(["p", "h2", "h6"]):
        stripped = paragraph.text.strip()
        lowered = stripped.lower()
        is_heading = paragraph.name in ["h2", "h6"]

        # A heading mentioning "synopsis" opens the block.
        if 'synopsis' in " ".join(lowered.split()) and is_heading:
            flag = 1
            continue

        collapsed = paragraph.text.replace("\r\n", "").strip()
        # continue collecting text from the rest of the p tags
        # (blank headings are tolerated, anything mentioning the site name is not)
        if flag == 1 and "thebestnotes" not in lowered and \
                (paragraph.name == 'p' or (is_heading and collapsed == "")):
            if stripped != "":
                overview_paragraphs.append(unidecode(collapsed))
        else:
            # end collecting the summary when the above conditions are not met
            flag = 0
    return overview_paragraphs
def get_section_paragraphs(section, section_titles, section_title_orig, specific_summary_dir, index):
    """Download a section page and collect the paragraphs under a matching heading.

    Every entry of section_titles is normalised with unify_title and tried in
    order; the search stops at the first candidate that yields any paragraph.

    Args:
        section: URL of the page to fetch.
        section_titles: candidate heading texts, tried in the given order.
        section_title_orig: original link text; only used in error reporting.
        specific_summary_dir: output directory of the work; its basename is
            compared against a book name that needs special handling.
        index: section counter; only used (as index + 1) in the error log.

    Returns:
        A list of paragraph strings, empty when nothing matched or the page
        could not be fetched (fetch failures are logged to f_errors).
    """
    book_name = os.path.basename(specific_summary_dir)

    # Repair URLs whose scheme separator carries a single slash.
    if 'https:/the' in section:
        section = section.replace('https:/the', 'https://the')
    if 'http:/the' in section:
        section = section.replace('http:/the', 'http://the')

    try:
        # The context manager closes the HTTP response once it has been parsed.
        with urllib.request.urlopen(section) as response:
            soup = BeautifulSoup(response, "html.parser")
    except URLError as err:
        print(err, section, "Retrying after sleep")
        time.sleep(10)
        try:
            with urllib.request.urlopen(section) as response:
                soup = BeautifulSoup(response, "html.parser")
        except Exception as e:
            print(section, e)
            f_errors.write(section + "\t" + str(e))
            f_errors.write("\n")
            return []
    except Exception as e:
        print(section_title_orig, section, e)
        f_errors.write(str(index+1) + "\t" + section + "\t" + str(section_titles) + "\t" + section_title_orig + "\t" + specific_summary_dir + "\n")
        return []

    heading_tags = ["h2", "h6"]
    # For this one book, <p> tags are accepted where headings are normally required.
    is_looking_backward = book_name == "Looking Backward: 2000-1887"

    # True if a structured page exists like - https://web.archive.org/web/20210111015641/http://thebestnotes.com/booknotes/Invisible_Man_Wells/The_Invisible_Man_Study_Guide14.html
    structured_page = soup.findAll("div", {"class": "large-12 columns"}) != []

    # The page does not change between candidate titles, so walk it once and
    # keep the normalised views of every tag that the matching below needs.
    candidates = []
    for tag in soup.findAll(["p", "h2", "h6"]):
        stripped = tag.text.strip()
        lowered = stripped.lower()
        candidates.append((
            tag.name,
            stripped,
            lowered,
            unify_title(" ".join(lowered.split())),
            tag.text.replace("\r\n", "").strip(),
        ))

    section_paragraphs = []
    # 0 = searching, 1 = title seen, 2 = "summary" heading seen (unstructured pages only).
    # NOTE(review): the state is not reset between candidate titles; kept as-is
    # to preserve the scraper's existing output.
    flag = 0

    for section_title in section_titles:
        section_title = unify_title(section_title.strip())

        if structured_page:
            for name, stripped, lowered, unified, collapsed in candidates:
                is_heading = name in heading_tags

                if section_title in unified and is_heading:
                    flag = 1
                    continue

                # continue collecting text from the rest of the p tags
                if flag == 1 and "thebestnotes" not in lowered and \
                        (name == 'p' or (is_heading and stripped == "")):
                    if stripped != "":
                        section_paragraphs.append(collapsed)
                else:
                    # end collecting the summary
                    flag = 0
        else:
            for name, stripped, lowered, unified, collapsed in candidates:
                is_heading = name in heading_tags
                title_like = is_heading or (is_looking_backward and name in ["p", "h2", "h6"])

                if section_title in unified and title_like:
                    flag = 1
                    continue

                # After the title: skip blank title-like tags, and switch to
                # collecting once a tag reading exactly "summary" is met.
                if flag == 1 and "thebestnotes" not in lowered and \
                        title_like and (collapsed == "" or lowered == "summary"):
                    if stripped != "":
                        flag = 2
                    continue

                if flag == 2 and "thebestnotes" not in lowered and \
                        (name == 'p' or (is_heading and stripped == "")):
                    if stripped != "":
                        section_paragraphs.append(collapsed)
                elif flag == 2:
                    # end collecting the summary
                    flag = 0

        if section_paragraphs:
            break
        # else there might be some text we missed

    return section_paragraphs
def save_section_para(section_paragraphs, section_titles, section_title_orig, section, specific_summary_dir, index):
    """Write one section summary as JSON to <specific_summary_dir>/section_<index>.txt.

    The paragraphs are concatenated without a separator and stored under
    "summary"; "analysis" is always written as an empty string.
    """
    print ("save_section_para: ", section_titles[0], section)

    # Key order is kept stable so the serialised file layout does not change.
    section_dict = {
        "name": section_title_orig,
        "summary": "".join(section_paragraphs),
        "analysis": "",
        "url": section,
    }

    file_name = 'section_%d.txt' % int(index)
    output_fname = os.path.join(specific_summary_dir, file_name)
    with open(output_fname, 'w', encoding="utf-8") as out_file:
        json.dump(section_dict, out_file)
# For each summary info: fetch the work's landing page, then follow every link
# on it, saving the synopsis as overview.json and each section as section_<n>.txt
for k, (title, page_url) in enumerate(summary_infos):
    print('\n>>> {}. {} - {} <<<'.format(k, title, page_url))

    # Create a directory for the work if needed
    specific_summary_dir = os.path.join(SUMMARY_DIR, title)
    if not os.path.exists(specific_summary_dir):
        os.makedirs(specific_summary_dir)
    else:
        print("Found existing directory")
        # continue

    print ("page_url: ", page_url)

    # Parse page (one retry after a pause; the response is closed once parsed)
    try:
        with urllib.request.urlopen(page_url) as response:
            soup = BeautifulSoup(response, "html.parser")
    except URLError as err:
        print (err, page_url, "Retrying after sleep")
        time.sleep(10)
        try:
            with urllib.request.urlopen(page_url) as response:
                soup = BeautifulSoup(response, "html.parser")
        except Exception as e:
            print (page_url, e)
            f_errors.write(page_url + "\t" + str(e))
            f_errors.write("\n")
            continue

    # Relative hrefs on the page are resolved against the page's directory
    one_level_up_url = os.path.dirname(page_url) + "/"

    #fetch section
    section_paragraphs = []
    navigation_links = soup.findAll("a")
    index = -1  # For section numbers; incremented before each save, so files start at section_0

    for link in navigation_links:
        link_text = " ".join(link.text.strip().lower().split())

        if 'synopsis' in link_text:
            overview = urllib.parse.urljoin(one_level_up_url, link.get("href"))
            print ("overview: ",overview)

            # A failed fetch must not abort the whole run: treat a missing
            # result as "no paragraphs" before joining.
            overview_paragraphs = get_overview_paragraphs(overview) or []
            overview_text = "".join(overview_paragraphs)

            overview_dict = {}
            overview_dict["name"] = "overview"
            overview_dict["summary"] = overview_text
            overview_dict["analysis"] = ""
            overview_dict["url"] = overview

            output_fname = os.path.join(specific_summary_dir, "overview.json")
            with open(output_fname, 'w', encoding="utf-8") as fp:
                json.dump(overview_dict, fp)
        else:
            section_title_orig = link_text
            # Links without visible text cannot be matched against a heading
            if (section_title_orig == ""):
                continue

            section = urllib.parse.urljoin(one_level_up_url, link.get("href"))

            # Keep the original one first in the list of possible titles to match. For chapter numbers like "TWENTY-THREE" and "TWENTY-FOUR", which occur on the same web page.
            section_titles = [section_title_orig]

            # To handle cases where the og page says Chapter 1 - X, but the summary page just says X
            # Add the different kind of section titles we can have into a list
            if ('-' in section_title_orig):
                section_titles = section_titles + section_title_orig.strip().split('-')
            elif (':' in section_title_orig):
                section_titles = section_titles + section_title_orig.strip().split(':')

            section_paragraphs = get_section_paragraphs(section, section_titles, section_title_orig, specific_summary_dir, index)

            # Only pages that produced text consume a section number
            if (section_paragraphs != []):
                index += 1
                save_section_para(section_paragraphs, section_titles, section_title_orig, section, specific_summary_dir, index)
================================================
FILE: scripts/data_collection/thebestnotes/get_works.py
================================================
"""
/*
* Copyright (c) 2021, salesforce.com, inc.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
* For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
*/
"""
from builtins import zip, str, range
import pdb, os, csv, re, io, string
import urllib.request, urllib.error, urllib.parse
from bs4 import BeautifulSoup
from tqdm import tqdm
from shutil import rmtree
from nltk.tokenize import word_tokenize, sent_tokenize
import time
# PARAMS
# Base against which the scraped (relative) book links are resolved
MAIN_SITE = 'https://web.archive.org/web/20210111015641/http://thebestnotes.com/'

# Suffixes of the index pages: one page per letter A-P, then the grouped ones
alphabet_list = list('ABCDEFGHIJKLMNOP') + ['QR', 'S', 'T', 'UV', 'W', 'XYZ']

# Each index page URL is SEED_URL + <suffix> + ".html"
SEED_URL = 'https://web.archive.org/web/20210111015641/http://thebestnotes.com/list/titles'

# Opened in write mode, so the log starts empty on every run
errors_file = open("link_errors.txt","w")
def _fetch_book_entries(books_page):
    """Download one index page and return the <p> tags of its first "large-7 columns" div.

    Raises whatever urlopen/BeautifulSoup raise, and IndexError when the page
    has no such div; the caller handles all of these.
    """
    # The context manager closes the HTTP response once it has been parsed.
    with urllib.request.urlopen(books_page) as response:
        soup = BeautifulSoup(response, "html.parser")
    items = soup.findAll("div", {"class": "large-7 columns"})
    return items[0].findAll("p")


def scrape_index_pages(seed_page):
    """Collect title/URL pairs for every book listed on the per-letter index pages.

    Args:
        seed_page: URL prefix; each entry of alphabet_list plus ".html" is
            appended to it to form an index page URL.

    Returns:
        A list of {"title": ..., "url": ...} dicts, with whitespace in titles
        collapsed and URLs resolved against MAIN_SITE. Entries whose link
        contains "store" are left out. Pages and entries that fail are logged
        to errors_file and skipped.
    """
    scraped_links = []

    for char in alphabet_list:
        books_page = seed_page + char + ".html"

        try:
            books = _fetch_book_entries(books_page)
        except Exception:
            # In order to handle timeouts: pause, then try exactly once more
            time.sleep(10)
            try:
                books = _fetch_book_entries(books_page)
            except Exception as e:
                print ("Skipping: ", books_page, str(e))
                errors_file.write(books_page + "\t" + str(e) + "\n")
                continue

        # Go over each listed book
        for item in books:
            try:
                # An entry without a link (or without an href) raises here and
                # is reported by the handler below.
                anchor = item.find("a")
                item_title = anchor.text
                item_url = anchor.get("href")

                print ("item_title: ", " ".join(item_title.split()))
                print ("item_url: ", item_url.strip(), "\n")

                #Don't add the book to the list if it isn't freely available
                if 'store' in item_url:
                    continue

                scraped_links.append({
                    "title": " ".join(item_title.split()),
                    "url": urllib.parse.urljoin(MAIN_SITE, item_url.strip())
                })
            except Exception as e:
                print ("Skipping: ", str(item), str(e), "\n")
                errors_file.write(str(item) + "\t" + str(e) + "\n")

    return scraped_links
# generate literature links: scrape every index page, then write one
# "<title>\t<url>" line per book
scraped_data = scrape_index_pages(SEED_URL)

with open("literature_links.tsv", "w") as fd:
    fd.writelines(
        "%s\t%s\n" % (entry["title"], entry["url"]) for entry in scraped_data
    )
================================================
FILE: scripts/data_collection/thebestnotes/literature_links.tsv.pruned
================================================
A Tale of Two Cities https://web.archive.org/web/20190903025221/http://thebestnotes.com/booknotes/Tale_Of_Two_Cities_Dickens/A_Tale_Of_Two_Cities_Study_Guide01.html
Little Women https://web.archive.org/web/20190903025235/http://thebestnotes.com/booknotes/Little_Women_Alcott/Little_Women_Study_Guide01.html
Pride and Prejudice https://web.archive.org/web/20190903025227/http://thebestnotes.com/booknotes/Pride_And_Prejudice_Austen/Pride_And_Prejudice_Study_Guide01.html
Ivanhoe https://web.archive.org/web/20190903025232/http://thebestnotes.com/booknotes/Ivanhoe_Sir_Walter_Scott/Ivanhoe_Study_Guide01.html
Treasure Island https://web.archive.org/web/20190903025221/http://thebestnotes.com/booknotes/Treasure_Island/Treasure_Island01.html
Around the World in 80 Days https://web.archive.org/web/20210417141554/http://thebestnotes.com/booknotes/Around_The_World_In_80_Days/Around_The_World_In_Eighty_Days01.html
White Fang https://web.archive.org/web/20190903025238/http://thebestnotes.com/booknotes/White_Fang/White_Fang01.html
Invisible Man https://web.archive.org/web/20190903025232/http://thebestnotes.com/booknotes/Invisible_Man_Ellison/Invisible_Man_Study_Guide01.html
Looking Backward: 2000-1887 https://web.archive.org/web/20190903025235/http://thebestnotes.com/booknotes/Looking_Backward_Bellamy/Looking_Backward_Study_Guide01.html