#!/usr/bin/env python3
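"""Scrape race-result tables from second-a-lap.blogspot.com.

`fetch` pulls a post through the Blogger JSON feed, saves each HTML
table it finds under second-a-lap/ and a CSV version under
second-a-lap/csv/ (both directories must already exist), and
`compile` merges any number of those CSVs into races.csv and
compiled.csv.
"""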

from sys import argv
from urllib.parse import urlparse, urlencode
from urllib.request import urlopen
import json
import re
import csv
from lxml import html, etree
from bs4 import BeautifulSoup

def fetch(url):
    """Fetch one post via the Blogger JSON feed and dump its tables."""
    print(url)
    query = urlencode({'alt': 'json', 'v': 2, 'dynamicviews': 1, 'path': url})
    feed = urlopen('http://second-a-lap.blogspot.com/feeds/posts/default?' + query)
    contents = json.loads(feed.read())
    entry = contents['feed']['entry'][0]
    title = entry['title']['$t']
    print(title)
    tree = html.fromstring(entry['content']['$t'])
    # Result tables on the blog are the ones carrying a bordercolor attribute.
    tables = tree.xpath("//table[@bordercolor]")
    for i, table in enumerate(tables, start=1):
        name = "".join(x for x in title if x.isalnum()) + '-' + str(i)
        print(name)
        markup = etree.tostring(table, encoding='unicode')
        with open('second-a-lap/' + name + '.txt', 'w') as dump:
            dump.write(markup)
        with open('second-a-lap/csv/' + name + '.csv', 'w', newline='') as out:
            csv_file = csv.writer(out)
            soup = BeautifulSoup(markup, 'html.parser')
            for row in soup.find_all('tr'):
                # Collapse runs of whitespace inside each td/th cell.
                cells = [re.sub(r'\s+', ' ', ' '.join(t.stripped_strings))
                         for t in row.find_all(re.compile('t[dh]'))]
                csv_file.writerow(cells)

def compile(files):
    """Merge per-race CSV dumps into races.csv and compiled.csv.

    (Shadows the builtin compile(), which this script never needs.)
    """
    headers = set()
    values = []
    race_id = 0
    with open('races.csv', 'w', newline='') as races_out:
        writer = csv.writer(races_out)
        for path in files:
            try:
                with open(path, 'r', newline='') as csvfile:
                    reader = csv.reader(csvfile)
                    header = next(reader)
                    headers |= set(header)
                    for row in reader:
                        data = dict(zip(header, row))
                        data['Race'] = race_id
                        values.append(data)
                    writer.writerow([race_id, path, '', '', ''])
                    race_id += 1
            except IOError:
                pass
    headers.add('Race')
    # Sort for a stable column order; iterating a bare set is unpredictable.
    columns = sorted(headers)
    with open('compiled.csv', 'w', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(columns)
        for row in values:
            writer.writerow([row.get(name, '') for name in columns])

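# Usage:
#   second-a-lap.py fetch <post-url> [<post-url> ...]   download posts, dump tables
#   second-a-lap.py compile <csv-file> [...]            merge dumps into compiled.csv
#   second-a-lap.py <post-url>                          shorthand for a single fetch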
if __name__ == "__main__":
    if len(argv) > 1:
        if argv[1] == 'fetch' and len(argv) > 2:
            for url in argv[2:]:
                fetch(urlparse(url).path)
        elif argv[1] == 'compile':
            compile(argv[2:])
        else:
            fetch(urlparse(argv[1]).path)