Diffstat (limited to 'dumps')
-rwxr-xr-x  dumps/second-a-lap.py  55
1 file changed, 49 insertions, 6 deletions
diff --git a/dumps/second-a-lap.py b/dumps/second-a-lap.py
index 12cf514..dc9b86b 100755
--- a/dumps/second-a-lap.py
+++ b/dumps/second-a-lap.py
@@ -4,7 +4,8 @@ from sys import argv
 import urlparse, urllib, urllib2
 import json, pprint
 from lxml import html, etree
-import os, string
+from bs4 import BeautifulSoup
+import os, string, re, csv
 
 def fetch(url):
     print url
@@ -16,13 +17,55 @@ def fetch(url):
     tables = tree.xpath("//table[@bordercolor]")
     i = 1
     for table in tables:
-        name = "".join(x for x in title if x.isalnum()) + '-' + str(i) + '.txt'
+        name = "".join(x for x in title if x.isalnum()) + '-' + str(i)
         print name
-        path = open('second-a-lap/' + name, 'w')
-        print >>path, etree.tostring(table)
+        path = open('second-a-lap/' + name + '.txt', 'w')
+        table = etree.tostring(table)
+        print >>path, table
+        csv_file = csv.writer(open('second-a-lap/csv/' + name + '.csv', 'w'))
+        soup = BeautifulSoup(table)
+        for row in soup.find_all('tr'):
+            row = map(lambda t: re.sub('\s+', ' ', " ".join(t.stripped_strings)).encode('utf-8'), row.find_all(re.compile('t[dh]')))
+            csv_file.writerow(row)
         i += 1
 
+def compile(files):
+    headers = set()
+    values = []
+    for path in files:
+        try:
+            with open(path, 'r') as csvfile:
+                reader = csv.reader(csvfile)
+                header = next(reader)
+                headers = set(headers | set(header))
+                for row in reader:
+                    data = {}
+                    i = 0
+                    for cell in row:
+                        data[header[i]] = cell
+                        i += 1
+                    values.append(data)
+        except IOError:
+            pass
+    writer = csv.writer(open('compiled.csv', 'w'))
+    writer.writerow(list(headers))
+    for row in values:
+        csvrow = []
+        for name in headers:
+            if name in row:
+                csvrow.append(row[name])
+            else:
+                csvrow.append('')
+        writer.writerow(csvrow)
+
 if __name__ == "__main__":
     if len(argv) > 1:
-        url = urlparse.urlparse(argv[1])
-        fetch(url.path)
+        if argv[1] == 'fetch' and len(argv) > 2:
+            url = urlparse.urlparse(argv[2])
+            fetch(url.path)
+        elif argv[1] == 'compile':
+            files = argv[2:]
+            compile(files)
+        else:
+            url = urlparse.urlparse(argv[1])
+            fetch(url.path)
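
The per-table CSV export added in this patch runs under Python 2 with BeautifulSoup. As a rough sketch only (not part of the commit), the same row extraction could be written for Python 3 roughly as follows, assuming bs4 is installed, html_text holds one serialized table, and table_to_csv/out_path are illustrative names:

    import csv
    import re
    from bs4 import BeautifulSoup

    def table_to_csv(html_text, out_path):
        # Parse the serialized table and write one CSV row per <tr>.
        soup = BeautifulSoup(html_text, 'html.parser')
        with open(out_path, 'w', newline='') as f:
            writer = csv.writer(f)
            for tr in soup.find_all('tr'):
                # Join each cell's text and collapse runs of whitespace,
                # mirroring the re.sub('\s+', ' ', ...) call in the patch.
                cells = [re.sub(r'\s+', ' ', td.get_text(' ', strip=True))
                         for td in tr.find_all(['td', 'th'])]
                writer.writerow(cells)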
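
The compile() function added above merges several scraped CSV files by taking the union of their headers and leaving missing columns blank. A minimal Python 3 sketch of the same idea, using csv.DictReader/DictWriter (function and file names here are illustrative, not from the commit):

    import csv

    def merge_csvs(paths, out_path='compiled.csv'):
        headers = []   # union of all column names, in first-seen order
        rows = []      # one dict per data row
        for path in paths:
            try:
                with open(path, newline='') as f:
                    reader = csv.DictReader(f)
                    for name in reader.fieldnames or []:
                        if name not in headers:
                            headers.append(name)
                    rows.extend(reader)
            except IOError:
                pass  # unreadable files are skipped, as in the patch
        with open(out_path, 'w', newline='') as f:
            # restval='' fills columns that a given input file did not have.
            writer = csv.DictWriter(f, fieldnames=headers, restval='')
            writer.writeheader()
            writer.writerows(rows)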