#!/usr/bin/env python
# Scrapes result tables from posts on second-a-lap.blogspot.com via the
# Blogger JSON feed, saving each table as raw HTML and as CSV, and can
# merge the per-table CSVs into a single spreadsheet. Python 2.
from sys import argv
import urlparse, urllib, urllib2
import json
from lxml import html, etree
from bs4 import BeautifulSoup
import os, re, csv


def fetch(url):
    # `url` is the path component of a post URL; Blogger's feed API can
    # look the post up by path and return it as JSON.
    print url
    feed_url = ('http://second-a-lap.blogspot.com/feeds/posts/default?' +
                urllib.urlencode({'alt': 'json', 'v': 2, 'dynamicviews': 1,
                                  'path': url}))
    contents = json.loads(urllib2.urlopen(feed_url).read())
    title = contents['feed']['entry'][0]['title']['$t']
    print title
    text = contents['feed']['entry'][0]['content']['$t']
    tree = html.fromstring(text)
    # The result tables in these posts carry a bordercolor attribute,
    # which distinguishes them from layout tables.
    tables = tree.xpath("//table[@bordercolor]")
    # Make sure the output directories exist before writing into them.
    if not os.path.isdir('second-a-lap/csv'):
        os.makedirs('second-a-lap/csv')
    i = 1
    for table in tables:
        # Filesystem-safe name: the alphanumerics of the title plus a counter.
        name = "".join(x for x in title if x.isalnum()) + '-' + str(i)
        print name
        table = etree.tostring(table)
        # Keep the raw table HTML alongside the CSV version.
        with open('second-a-lap/' + name + '.txt', 'w') as path:
            print >>path, table
        with open('second-a-lap/csv/' + name + '.csv', 'w') as out:
            csv_file = csv.writer(out)
            soup = BeautifulSoup(table, 'lxml')
            for row in soup.find_all('tr'):
                # Flatten each <td>/<th> into one whitespace-normalized string.
                cells = [re.sub(r'\s+', ' ', " ".join(t.stripped_strings)).encode('utf-8')
                         for t in row.find_all(re.compile('t[dh]'))]
                csv_file.writerow(cells)
        i += 1


def compile(files):
    # Merge the per-table CSVs into races.csv (an index of race id -> source
    # file) and compiled.csv (every row, under the union of all headers).
    # Note: the name shadows the built-in compile(), as in the original.
    headers = set()
    values = []
    race_id = 0
    with open('races.csv', 'w') as races:
        writer = csv.writer(races)
        for path in files:
            try:
                with open(path, 'r') as csvfile:
                    reader = csv.reader(csvfile)
                    header = next(reader)
                    headers |= set(header)
                    for row in reader:
                        # Pair cells with column names; zip() quietly drops
                        # any trailing cells without a matching header.
                        data = dict(zip(header, row))
                        data['Race'] = race_id
                        values.append(data)
                writer.writerow([race_id, path, '', '', ''])
                race_id += 1
            except IOError:
                # Skip files that can't be opened.
                pass
    headers.add('Race')
    with open('compiled.csv', 'w') as compiled:
        writer = csv.writer(compiled)
        writer.writerow(list(headers))
        for row in values:
            # Blank cell wherever a row has no value for a column.
            writer.writerow([row.get(name, '') for name in headers])


if __name__ == "__main__":
    if len(argv) > 1:
        if argv[1] == 'fetch' and len(argv) > 2:
            for url in argv[2:]:
                fetch(urlparse.urlparse(url).path)
        elif argv[1] == 'compile':
            compile(argv[2:])
        else:
            fetch(urlparse.urlparse(argv[1]).path)
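
# Usage sketch (inferred from the dispatch above; "scrape.py" stands in for
# whatever this file is saved as -- the original name isn't given):
#   python scrape.py fetch <post-url> [<post-url> ...]   # dump each post's tables
#   python scrape.py compile second-a-lap/csv/*.csv      # merge into compiled.csv
#   python scrape.py <post-url>                          # shorthand for one fetch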