diff options
Diffstat (limited to 'dumps')
-rwxr-xr-x | dumps/second-a-lap.py | 55 |
1 files changed, 49 insertions, 6 deletions
diff --git a/dumps/second-a-lap.py b/dumps/second-a-lap.py
index 12cf514..dc9b86b 100755
--- a/dumps/second-a-lap.py
+++ b/dumps/second-a-lap.py
@@ -4,7 +4,8 @@ from sys import argv
 import urlparse, urllib, urllib2
 import json, pprint
 from lxml import html, etree
-import os, string
+from bs4 import BeautifulSoup
+import os, string, re, csv
 
 def fetch(url):
     print url
@@ -16,13 +17,55 @@ def fetch(url):
     tables = tree.xpath("//table[@bordercolor]")
     i = 1
     for table in tables:
-        name = "".join(x for x in title if x.isalnum()) + '-' + str(i) + '.txt'
+        name = "".join(x for x in title if x.isalnum()) + '-' + str(i)
         print name
-        path = open('second-a-lap/' + name, 'w')
-        print >>path, etree.tostring(table)
+        path = open('second-a-lap/' + name + '.txt', 'w')
+        table = etree.tostring(table)
+        print >>path, table
+        csv_file = csv.writer(open('second-a-lap/csv/' + name + '.csv', 'w'))
+        soup = BeautifulSoup(table)
+        for row in soup.find_all('tr'):
+            row = map(lambda t: re.sub('\s+', ' ', " ".join(t.stripped_strings)).encode('utf-8'), row.find_all(re.compile('t[dh]')))
+            csv_file.writerow(row)
         i += 1
 
+def compile(files):
+    headers = set()
+    values = []
+    for path in files:
+        try:
+            with open(path, 'r') as csvfile:
+                reader = csv.reader(csvfile)
+                header = next(reader)
+                headers = set(headers | set(header))
+                for row in reader:
+                    data = {}
+                    i = 0
+                    for cell in row:
+                        data[header[i]] = cell
+                        i += 1
+                    values.append(data)
+        except IOError:
+            pass
+    writer = csv.writer(open('compiled.csv', 'w'))
+    writer.writerow(list(headers))
+    for row in values:
+        csvrow = []
+        for name in headers:
+            if name in row:
+                csvrow.append(row[name])
+            else:
+                csvrow.append('')
+        writer.writerow(csvrow)
+
 if __name__ == "__main__":
     if len(argv) > 1:
-        url = urlparse.urlparse(argv[1])
-        fetch(url.path)
+        if argv[1] == 'fetch' and len(argv) > 2:
+            url = urlparse.urlparse(argv[2])
+            fetch(url.path)
+        elif argv[1] == 'compile':
+            files = argv[2:]
+            compile(files)
+        else:
+            url = urlparse.urlparse(argv[1])
+            fetch(url.path)