s and attributes since they are not needed anymore.
# 2) Rewrite relative paths ("/Brian-Bi") to full URLs
# 3) Download a copy of each embedded image
def cleanup_tree(doc, src, dest):
    """Append a simplified copy of the children of ``src`` to ``dest``.

    Each direct child of ``src`` is handled according to its kind:

    * text nodes and ``br``/``hr`` are shallow-copied;
    * simple formatting tags, ``code``, ``pre`` and ``a`` are re-created
      without their original attributes (``a`` keeps only ``href``) and their
      children are processed recursively;
    * ``span``/``div`` are dropped and their processed children are spliced
      directly into ``dest``;
    * elements whose class contains ``ContentFooter`` or ``hidden`` are
      skipped entirely;
    * elements carrying ``data-embed`` are replaced by the iframe described
      by that attribute;
    * ``img`` elements are re-created and, unless ``args.no_download`` is
      set, the image is saved into ``args.output_dir``;
    * anything else is deep-copied unchanged, with a warning on stderr.

    Args:
        doc: DOM document used to create and import the new nodes.
        src: DOM node whose children are read.
        dest: DOM node that receives the cleaned-up children.

    Raises:
        ValueError: if a child is neither a text node nor an element node.
    """
    for child in src.childNodes:
        if child.nodeType == Node.TEXT_NODE:
            # Text nodes can simply be left as-is
            dest.appendChild(child.cloneNode(False))
            continue
        if child.nodeType != Node.ELEMENT_NODE:
            # ???
            raise ValueError()
        # Otherwise, it's an element node. The order of the checks below
        # matters: e.g. a hidden <div> must be skipped before the generic
        # span/div branch gets a chance to splice its contents in.
        tag = child.tagName
        if tag in ['br', 'hr']:
            dest.appendChild(child.cloneNode(False))
        elif tag in ['b', 'i', 'u', 'h2', 'ol', 'ul', 'li', 'blockquote', 'wbr', 'p']:
            # This node doesn't need to be modified but its children might.
            # Also, we won't copy over any of its attributes.
            new_node = doc.createElement(tag)
            cleanup_tree(doc, child, new_node)
            dest.appendChild(new_node)
        elif child.getAttribute('data-embed') != '':
            # This is a video. We want to copy the data-embed value, which is HTML for an iframe node.
            # So, we have to parse it into a separate document and import the node.
            iframe_html = child.getAttribute('data-embed')
            parser = HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
            iframe_doc = parser.parse(iframe_html)
            try:
                iframe = iframe_doc.documentElement.childNodes[1].firstChild
                if iframe.tagName != 'iframe':
                    raise ValueError()
                new_node = doc.importNode(iframe, False)
                # Quora uses a protocol-relative URL (//youtube.com/...) so let's make sure we rewrite this.
                embed_src = new_node.getAttribute('src')
                if embed_src.startswith('//'):
                    new_node.setAttribute('src', 'http:' + embed_src)
                # The video will look really bad if we don't explicitly set the dimensions.
                new_node.setAttribute('width', '525')
                new_node.setAttribute('height', '295')
                dest.appendChild(new_node)
            except Exception:
                # Deliberately broad: any problem with the embed code is non-fatal.
                print('[WARNING] Failed to parse video embed code', file=sys.stderr)
                # Bail out by just copying the original HTML
                dest.appendChild(child.cloneNode(True))
        elif tag == 'code':
            # Inline code block. Strip the attributes.
            new_node = doc.createElement('code')
            dest.appendChild(new_node)
            cleanup_tree(doc, child, new_node)
        elif 'ContentFooter' in child.getAttribute('class') or 'hidden' in child.getAttribute('class'):
            # These are nodes we just want to skip.
            continue
        elif tag in ['span', 'div']:
            # don't insert a span or div; just insert its contents
            cleanup_tree(doc, child, dest)
        elif tag == 'a':
            # A link. We only want to copy the href, and pass the rest through.
            new_node = doc.createElement('a')
            href = child.getAttribute('href')
            if href.startswith('/'):
                href = 'http://quora.com' + href
            new_node.setAttribute('href', href)
            dest.appendChild(new_node)
            cleanup_tree(doc, child, new_node)
        elif tag == 'img':
            # Prefer master_src when present; fall back to the plain src.
            img_src = child.getAttribute('master_src')
            if img_src == '':
                img_src = child.getAttribute('src')
            new_node = doc.createElement('img')
            new_node.setAttribute('src', img_src)
            new_node.setAttribute('alt', child.getAttribute('alt'))
            if args.no_download:
                dest.appendChild(new_node)
                continue
            # Save a copy of the image locally.
            # If an error occurs, just leave the src pointing to Quora.
            try:
                # The file name is the last path component, without the query string.
                m = re.search(r'/([^/?]+)(\?|$)', img_src)
                if m is None:
                    raise ValueError()
                filename = m.group(1)
                if not filename.endswith('.png'):
                    filename += '.png'
                img_path = os.path.join(args.output_dir, filename)
                try:
                    # O_EXCL makes an already-saved image show up as EEXIST.
                    img_fd = os.open(img_path, os.O_WRONLY | os.O_CREAT | os.O_EXCL)
                except OSError as error:
                    if error.errno == errno.EEXIST:
                        log_if_v('Image %s has already been saved; skipping' % filename)
                        new_node.setAttribute('src', filename)
                        # The outer ``finally`` still appends new_node.
                        continue
                    raise
                log_if_v('Downloading image from %s' % img_src)
                saved = False
                try:
                    # The file object owns img_fd from here on, so the
                    # descriptor is closed on every path out of this block.
                    with os.fdopen(img_fd, 'wb') as img_file:
                        with urllib.request.urlopen(img_src) as response:
                            img = response.read()
                        time.sleep(args.delay)
                        img_file.write(img)
                    saved = True
                finally:
                    if not saved:
                        # Don't leave the file there; we will retry it next time.
                        try:
                            os.remove(img_path)
                        except OSError:
                            print('[WARNING] Failed to remove incomplete file %s' % filename, file=sys.stderr)
                # If everything went according to plan, rewrite the src to the local file.
                new_node.setAttribute('src', filename)
            except urllib.error.URLError as error:
                print('[WARNING] Failed to download image from URL %s (%s)' % (img_src, error.reason), file=sys.stderr)
            except OSError as error:
                print('[WARNING] Failed to save image from URL %s to file %s (%s)' % (img_src, filename, error.strerror), file=sys.stderr)
            except ValueError:
                print('[WARNING] Failed to determine image name from URL %s' % img_src, file=sys.stderr)
            finally:
                dest.appendChild(new_node)
        elif tag == 'pre':
            # Block (not inline) code. Quora's HTML already has the desired structure,
            # so we just need to strip the attributes from the <pre> element.
            new_node = doc.createElement('pre')
            dest.appendChild(new_node)
            cleanup_tree(doc, child, new_node)
        else:
            print('[WARNING] Unrecognized node', file=sys.stderr)
            # Bail out by just copying the original HTML
            dest.appendChild(child.cloneNode(True))
# Script body: convert every raw answer page found in input_dir into a
# lightweight, self-contained HTML page in output_dir.
parser = argparse.ArgumentParser(description='Convert answers downloaded from Quora into a more portable HTML format')
parser.add_argument('input_dir', nargs='?', default='./quora-answers', help='directory containing "raw" answers downloaded from Quora')
parser.add_argument('output_dir', nargs='?', default='./quora-answers-cooked', help='where to store the images and converted answers')
parser.add_argument('-d', '--delay', default=0, type=float, help='Time to sleep between downloads, in seconds')
parser.add_argument('-n', '--no_download', action='store_true', help='Do not save images')
parser.add_argument('-v', '--verbose', action='store_true', help='be verbose')
# ``args`` is a module-level name; the helper functions read it directly.
args = parser.parse_args()


def _open_owner_only(path, flags):
    """Opener for open(): newly created files get mode 0o600."""
    return os.open(path, flags, 0o600)


# Get a list of answers to convert...
filenames = sorted(f for f in os.listdir(args.input_dir) if f.endswith('.html'))
if not filenames:
    # sys.exit() accepts a single argument, so the message is formatted first.
    sys.exit('[FATAL] No .html files found in directory %s' % args.input_dir)
print('Found %d answers' % len(filenames), file=sys.stderr)

log_if_v('Creating directory %s' % args.output_dir)
try:
    os.mkdir(args.output_dir, 0o700)
except OSError as error:
    if error.errno == errno.EEXIST:
        log_if_v('Directory already exists')
    else:
        # This is the top level, and we have nothing else to do if we failed
        raise

for filename in filenames:
    sys.stderr.flush()
    print('Filename: ' + filename, file=sys.stderr)
    try:
        with open(os.path.join(args.input_dir, filename), 'rb') as page:
            page_html = page.read()
    except IOError as error:
        print('[ERROR] Failed to read %s (%s)' % (filename, error.strerror), file=sys.stderr)
        continue

    # Get the HTML element containing just the answer itself.
    # Also get the title.
    html_parser = HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    document = html_parser.parse(page_html, default_encoding='utf-8')
    title_node = get_title_node(document)
    log_if_v('Title: ' + ('(could not be determined)' if title_node is None else get_text_content(title_node)))

    # The answer is the first <div> whose class list contains ExpandedAnswer.
    answer_node = None
    for node in document.getElementsByTagName('div'):
        if 'ExpandedAnswer' in node.getAttribute('class').split():
            answer_node = node
            break
    if answer_node is None:
        print('[WARNING] Failed to locate answer on page (filename: %s)' % filename, file=sys.stderr)
        continue

    # Construct our new page...
    new_page = document.createElement('html')
    head_node = document.createElement('head')
    if title_node is not None:
        head_node.appendChild(title_node)
    meta_node = document.createElement('meta')
    meta_node.setAttribute('charset', 'utf-8')
    head_node.appendChild(meta_node)
    css = ("blockquote { border-left: 2px solid #ddd; color: #666; margin: 0; padding-left: 16px; } "
           "code, pre { background: #f4f4f4; } "
           "pre, h2 { margin: 0; } "
           "ul { margin: 0 0 0 16px; padding: 8px 0; } "
           "ol { margin: 0 0 0 28px; padding: 8px 0; } "
           "li { margin: 0 0 8px; } ")
    style_node = document.createElement('style')
    style_node.setAttribute('type', 'text/css')
    style_node.appendChild(document.createTextNode(css))
    head_node.appendChild(style_node)

    # Quora now uses MathJax, so set up the configuration object:
    script_node = document.createElement('script')
    script_text = document.createTextNode('window.MathJax = {"showMathMenu":false,"messageStyle":"none","errorSettings":{"style":{"color":"#000000","font-style":"normal"}},"HTML-CSS":{"linebreaks":{"automatic":true,"width":"container"},"EqnChunk":150,"EqnChunkDelay":20},"tex2jax":{"inlineMath":[["[math]","[/math]"]],"displayMath":[],"ignoreClass":"edit_latex|qtext_editor_content|ignore_latex","processClass":"render_latex","processEnvironments":false,"preview":"none"},"TeX":{"noUndefined":{"attributes":{"mathcolor":"red"}},"noErrors":{"multiLine":true,"style":{"max-width":"100%","overflow":"hidden"}},"Macros":{"C":"{\\mathbb{C}}","N":"{\\mathbb{N}}","O":"{\\emptyset}","Q":"{\\mathbb{Q}}","R":"{\\mathbb{R}}","Z":"{\\mathbb{Z}}"}},"fast-preview":{"disabled":true},"Safe":{"allow":{"URLs":"none","classes":"none","cssIDs":"none","styles":"none","fontsize":"none","require":"none"}}};')
    script_node.appendChild(script_text)
    head_node.appendChild(script_node)
    # and then load MathJax:
    script_node = document.createElement('script')
    script_node.setAttribute('type', 'text/javascript')
    script_node.setAttribute('src', 'https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js?config=TeX-AMS-MML_HTMLorMML,Safe')
    head_node.appendChild(script_node)
    new_page.appendChild(head_node)

    body_node = document.createElement('body')
    # This step processes Quora's HTML into a more lightweight and portable form.
    cleanup_tree(document, answer_node, body_node)
    new_page.appendChild(body_node)

    # Okay! Finally, save the HTML.
    try:
        # The third positional argument of open() is the buffer size, so the
        # intended 0o600 permissions are applied through an opener instead.
        with open(os.path.join(args.output_dir, filename), 'wb', opener=_open_owner_only) as saved_page:
            # The serializer is handed only the <html> element, so the doctype
            # is written by hand.
            saved_page.write(b'<!DOCTYPE html>')
            saved_page.write(serializer.serialize(new_page, 'dom', 'utf-8', omit_optional_tags=False))
    except IOError as error:
        print('[ERROR] Failed to save to file %s (%s)' % (filename, error.strerror), file=sys.stderr)

print('Done', file=sys.stderr)
================================================
FILE: crawler.py
================================================
#!/usr/bin/env python3
import argparse
import errno
import json
import os
import re
import sys
import time
import urllib.error
import urllib.request
def log_if_v(msg):
    """Write ``msg`` to stderr as a ``[DEBUG]`` line when ``args.verbose`` is set."""
    if not args.verbose:
        return
    print('[DEBUG] %s' % msg, file=sys.stderr)
# Given origin (timestamp offset by time zone) and string from Quora, e.g.
# "Added Jan 31", returns a string such as '2015-01-31'.
# Quora's short date strings don't provide enough information to determine the
# exact time, unless it was within the last day, so we won't bother to be any
# more precise.
def parse_quora_date(origin: float, quora_str: str) -> str:
    """Convert a Quora "Added ..." label into a ``'YYYY-MM-DD'`` string.

    The text after ``'Added '`` may take any of these forms:

    * ``just now`` or ``<N>am`` / ``<N>pm``: the date of ``origin`` itself;
    * ``<N>m ago`` / ``<N>h ago``: ``origin`` minus that many minutes/hours;
    * ``Mon`` .. ``Sun``: the most recent such weekday strictly before
      ``origin`` (1 to 7 days back);
    * ``Jan 31``: the most recent such month/day strictly before ``origin``
      (1 to 366 days back);
    * ``Jan 31, 2015``: that exact date.

    Args:
        origin: reference time in seconds; it is broken down with
            ``time.gmtime``, so any time-zone shift must already be applied.
        quora_str: label text containing ``'Added '`` followed by the date.

    Returns:
        The date formatted as ``'%d-%02d-%02d'`` (year, month, day).

    Raises:
        ValueError: if ``quora_str`` has nothing after ``'Added '``, if the
            date text matches none of the forms above, or if the named
            weekday or month/day cannot be found while walking backward.
    """
    days_of_week = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    months_of_year = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

    _, _, date_str = quora_str.partition('Added ')
    date_str = date_str.strip()
    if date_str == '':
        raise ValueError('"%s" does not appear to indicate when answer was added' % quora_str)

    day_names = '(' + '|'.join(days_of_week) + ')'
    month_names = '(' + '|'.join(months_of_year) + ')'
    just_now = re.match(r'just now$', date_str)
    minutes_ago = re.match(r'(\d+)m ago$', date_str)
    hours_ago = re.match(r'(\d+)h ago$', date_str)
    weekday = re.match(day_names + '$', date_str)
    month_day = re.match(month_names + r' (\d+)$', date_str)
    month_day_year = re.match(month_names + r' (\d+), (\d+)$', date_str)
    clock_time = re.match(r'(\d+)[ap]m$', date_str)

    if just_now is not None or clock_time is not None:
        # Using origin for time in am / pm since the time of the day will be discarded anyway
        tm = time.gmtime(origin)
    elif minutes_ago is not None:
        tm = time.gmtime(origin - 60 * int(minutes_ago.group(1)))
    elif hours_ago is not None:
        tm = time.gmtime(origin - 3600 * int(hours_ago.group(1)))
    elif weekday is not None:
        # Walk backward until we reach the given day of the week
        # (tm_wday uses Monday == 0, the same order as days_of_week).
        day_of_week = days_of_week.index(weekday.group(1))
        for offset in range(1, 8):
            tm = time.gmtime(origin - 86400 * offset)
            if tm.tm_wday == day_of_week:
                break
        else:
            raise ValueError('date "%s" is invalid' % date_str)
    elif month_day is not None:
        # Walk backward until we reach the given month and day
        month_of_year = months_of_year.index(month_day.group(1)) + 1
        day_of_month = int(month_day.group(2))
        for offset in range(1, 367):
            tm = time.gmtime(origin - 86400 * offset)
            if tm.tm_mon == month_of_year and tm.tm_mday == day_of_month:
                break
        else:
            # e.g. "Feb 30": no such day in the past 366 days
            raise ValueError('date "%s" is invalid' % date_str)
    elif month_day_year is not None:
        # may raise ValueError
        tm = time.strptime(date_str, '%b %d, %Y')
    else:
        raise ValueError('date "%s" could not be interpreted' % date_str)
    return '%d-%02d-%02d' % (tm.tm_year, tm.tm_mon, tm.tm_mday)
# Script body: download every answer listed in input_file into output_dir,
# naming each file after the answer's date and the question part of its URL.
parser = argparse.ArgumentParser(description='Download a set of answers from Quora')
parser.add_argument('input_file', help='file containing JSON-encoded list of timestamped URLs to download')
parser.add_argument('output_dir', nargs='?', default='./quora-answers', help='where to store the downloaded answers and images')
parser.add_argument('-d', '--delay', default=0, type=float, help='Time to sleep between answers, in seconds')
parser.add_argument('-t', '--origin_timestamp', default=None, type=int, help='JS time when the list of URLs was fetched')
parser.add_argument('-z', '--origin_timezone', default=None, type=int, help='browser timezone')
parser.add_argument('-v', '--verbose', action='store_true', help='enable debug messages')
parser.add_argument('-o', '--overwrite', action='store_true', help='Overwrite existing answers')
# ``args`` is a module-level name; log_if_v() reads it directly.
args = parser.parse_args()

# Determine the origin for relative date computation
if args.origin_timestamp is None:
    log_if_v('Using current time')
    args.origin_timestamp = time.time()
else:
    # NOTE(review): presumably a JS timestamp in milliseconds, converted to seconds here - confirm
    args.origin_timestamp //= 1000
if args.origin_timezone is None:
    log_if_v('Using system time zone')
    args.origin_timezone = time.timezone
else:
    # NOTE(review): presumably an offset in minutes, converted to seconds here - confirm
    args.origin_timezone *= 60
origin = args.origin_timestamp - args.origin_timezone

# Load the list of answer URLs from the input file.
log_if_v('Loading input file %s' % args.input_file)
with open(args.input_file, 'rb') as input_file:
    answers = json.load(input_file)

# Check the validity of the input before using it in any way: it must be a
# list of [url, date string] pairs.
if not isinstance(answers, list):
    sys.exit('[FATAL] Incorrect input format')
for e in answers:
    if not (isinstance(e, list) and len(e) == 2 and isinstance(e[0], str) and isinstance(e[1], str)):
        sys.exit('[FATAL] Incorrect input format')
print('Found %d answers' % len(answers), file=sys.stderr)

log_if_v('Creating directory %s' % args.output_dir)
try:
    os.mkdir(args.output_dir, 0o700)
except OSError as error:
    if error.errno == errno.EEXIST:
        log_if_v('Directory already exists')
    else:
        # This is the top level, and we have nothing else to do if we failed
        raise
# All files below are written relative to the output directory.
os.chdir(args.output_dir)

download_file_count = 0
for e in answers:
    sys.stderr.flush()
    url = e[0]
    print('URL: %s' % url, file=sys.stderr)

    # Determine the date when this answer was written
    try:
        added_time = parse_quora_date(origin, e[1])
    except ValueError as error:
        print('[WARNING] Failed to parse date: %s' % str(error), file=sys.stderr)
        added_time = 'xxxx-xx-xx'
    print('Date: %s' % added_time, file=sys.stderr)

    # Get the part of the URL indicating the question title; we will save under this name
    m1 = re.search(r'quora\.com/([^/]+)/answer', url)
    # if there's a context topic
    m2 = re.search(r'quora\.com/[^/]+/([^/]+)/answer', url)
    filename = added_time + ' '
    if m1 is not None:
        filename += m1.group(1)
    elif m2 is not None:
        filename += m2.group(1)
    else:
        print('[ERROR] Could not find question part of URL %s; skipping' % url, file=sys.stderr)
        continue

    # Trim the filename if it's too long. 255 bytes is the limit on many filesystems.
    suffix = '.html'
    max_stem_bytes = 255 - len(suffix.encode('utf-8'))
    filename_bytes = filename.encode('utf-8')
    if len(filename_bytes) > max_stem_bytes:
        # Cutting at a byte offset may split a multi-byte character; the
        # incomplete trailing bytes are dropped when decoding.
        filename = filename_bytes[:max_stem_bytes].decode('utf-8', errors='ignore')
        log_if_v('Filename was truncated to at most 255 bytes.')
    filename += suffix
    log_if_v('Filename: %s' % filename)

    # If overwrite is enabled or the answer doesn't exist
    if args.overwrite or not os.path.isfile(filename):
        # Fetch the URL to find the answer
        log_if_v('Downloading answer from URL %s' % url)
        try:
            with urllib.request.urlopen(url) as response:
                page_html = response.read()
            with open(filename, 'wb') as f:
                f.write(page_html)
        except urllib.error.URLError as error:
            print('[ERROR] Failed to download answer from URL %s (%s)' % (url, error.reason), file=sys.stderr)
            continue
        except IOError as error:
            print('[ERROR] Failed to save answer to file %s (%s)' % (filename, error.strerror), file=sys.stderr)
        else:
            # Only answers that were actually saved count as downloaded.
            download_file_count += 1
        # A request was made either way, so still pause before the next one.
        time.sleep(args.delay)
    else:
        log_if_v('Answer File : %s Already Exists. Skipping' % filename)

print('Done. Downloaded %d files' % download_file_count, file=sys.stderr)