#!/usr/bin/env python
# Python 2 scraper for race-result tables on second-a-lap.blogspot.com.
# "fetch" pulls a post through the Blogger JSON feed API and saves every
# bordered table as raw HTML and as CSV; "compile" merges those per-race
# CSVs into one spreadsheet (compiled.csv) plus an index (races.csv).
#
# Usage:
#   fetch URL [URL ...]      save every result table on each post
#   compile CSV [CSV ...]    merge per-race CSVs into compiled.csv
#   URL                      shorthand for "fetch URL"
import csv
import json
import os
import re
import urllib
import urllib2
import urlparse
from sys import argv

from lxml import etree, html
from bs4 import BeautifulSoup


def fetch(url):
    """Download the post at the given path and save each bordered table."""
    print url
    # The Blogger feed API returns the post whose path matches `url` as JSON.
    query = urllib.urlencode({'alt': 'json', 'v': 2, 'dynamicviews': 1, 'path': url})
    feed = urllib2.urlopen('http://second-a-lap.blogspot.com/feeds/posts/default?' + query)
    contents = json.loads(feed.read())
    entry = contents['feed']['entry'][0]
    title = entry['title']['$t']
    print title
    text = entry['content']['$t']
    tree = html.fromstring(text)
    # Result tables on the blog are the ones carrying a bordercolor attribute.
    tables = tree.xpath("//table[@bordercolor]")
    # Make sure the output directories exist before writing.
    if not os.path.isdir(os.path.join('second-a-lap', 'csv')):
        os.makedirs(os.path.join('second-a-lap', 'csv'))
    for i, table in enumerate(tables, 1):
        # Build a filesystem-safe file name from the post title.
        name = "".join(x for x in title if x.isalnum()) + '-' + str(i)
        print name
        markup = etree.tostring(table)
        # Keep the raw table HTML alongside the CSV conversion.
        with open(os.path.join('second-a-lap', name + '.txt'), 'w') as path:
            print >>path, markup
        csv_file = csv.writer(open(os.path.join('second-a-lap', 'csv', name + '.csv'), 'w'))
        soup = BeautifulSoup(markup, 'lxml')
        for row in soup.find_all('tr'):
            # Flatten each td/th cell to one whitespace-normalised UTF-8 string.
            cells = [re.sub(r'\s+', ' ', " ".join(t.stripped_strings)).encode('utf-8')
                     for t in row.find_all(re.compile('t[dh]'))]
            csv_file.writerow(cells)


def compile_races(files):
    """Merge per-race CSV files into compiled.csv, with races.csv as an index."""
    headers = set()
    values = []
    index = csv.writer(open('races.csv', 'w'))
    race_id = 0
    for path in files:
        try:
            with open(path, 'r') as csvfile:
                reader = csv.reader(csvfile)
                header = next(reader)
                headers |= set(header)
                for row in reader:
                    # Store cells by column name; the tables vary between races.
                    data = dict(zip(header, row))
                    data['Race'] = race_id
                    values.append(data)
            index.writerow([race_id, path, '', '', ''])
            race_id += 1
        except IOError:
            pass
    headers.add('Race')
    # Freeze the column order so the header row and the data rows agree.
    columns = list(headers)
    writer = csv.writer(open('compiled.csv', 'w'))
    writer.writerow(columns)
    for row in values:
        # Emit every known column, blank where this race's table lacks it.
        writer.writerow([row.get(name, '') for name in columns])


if __name__ == "__main__":
    if len(argv) > 1:
        if argv[1] == 'fetch' and len(argv) > 2:
            for url in argv[2:]:
                fetch(urlparse.urlparse(url).path)
        elif argv[1] == 'compile':
            compile_races(argv[2:])
        else:
            # A bare URL is treated as "fetch URL".
            fetch(urlparse.urlparse(argv[1]).path)