#!/usr/bin/env python2
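"""Scrape race result tables from second-a-lap.blogspot.com (Python 2).

Two modes, as implemented below:

    second-a-lap.py fetch URL [URL ...]
        Pull each post through Blogger's JSON feed API and dump every
        result table to second-a-lap/<name>.txt (raw HTML) and
        second-a-lap/csv/<name>.csv.

    second-a-lap.py compile CSV [CSV ...]
        Merge previously fetched per-race CSV files into races.csv (an
        index of race ids) and compiled.csv (one sheet holding the union
        of all columns).

The output directories second-a-lap/ and second-a-lap/csv/ must already
exist.
"""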

import csv
import json
import os
import re
import urllib
import urllib2
import urlparse
from sys import argv

from bs4 import BeautifulSoup
from lxml import etree, html


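# Fetch one post, identified by its URL path, through the Blogger feed API
# and write each <table bordercolor=...> it contains to disk as raw HTML
# and as CSV.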
def fetch(url):
    print url
    # Ask the Blogger feed API for the post at this path, as JSON.
    contents = json.loads(urllib2.urlopen(
        'http://second-a-lap.blogspot.com/feeds/posts/default?' +
        urllib.urlencode({'alt': 'json', 'v': 2, 'dynamicviews': 1,
                          'path': url})).read())
    entry = contents['feed']['entry'][0]
    title = entry['title']['$t']
    print title
    tree = html.fromstring(entry['content']['$t'])
    # The result tables in these posts carry a bordercolor attribute.
    tables = tree.xpath("//table[@bordercolor]")
    for i, table in enumerate(tables, 1):
        # Filesystem-safe name: alphanumerics of the title plus an index.
        name = "".join(x for x in title if x.isalnum()) + '-' + str(i)
        print name
        markup = etree.tostring(table)
        # Keep the table's raw HTML for reference.
        with open(os.path.join('second-a-lap', name + '.txt'), 'w') as dump:
            print >>dump, markup
        # Re-parse the table and emit one CSV row per <tr>, collapsing
        # whitespace inside each <td>/<th> cell.
        with open(os.path.join('second-a-lap', 'csv',
                               name + '.csv'), 'w') as out:
            csv_file = csv.writer(out)
            soup = BeautifulSoup(markup, 'lxml')
            for row in soup.find_all('tr'):
                csv_file.writerow(
                    [re.sub(r'\s+', ' ',
                            " ".join(t.stripped_strings)).encode('utf-8')
                     for t in row.find_all(re.compile('t[dh]'))])


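# Merge a set of per-race CSV files: collect the union of their header
# columns, tag every row with a numeric race id, and write the combined
# rows out.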
def compile_races(files):
    headers = set()
    values = []
    race_id = 0
    # races.csv indexes each numeric race id against its source file; the
    # trailing empty columns are left free for annotation.
    with open('races.csv', 'w') as races:
        writer = csv.writer(races)
        for path in files:
            try:
                with open(path, 'r') as csvfile:
                    reader = csv.reader(csvfile)
                    header = next(reader)
                    headers |= set(header)
                    for row in reader:
                        data = dict(zip(header, row))
                        data['Race'] = race_id
                        values.append(data)
                    writer.writerow([race_id, path, '', '', ''])
                    race_id += 1
            except IOError:
                pass
    headers.add('Race')
    # compiled.csv holds every row from every race under the union of all
    # column names seen; cells a given table lacked are left blank. Sorting
    # the columns makes the output order deterministic across runs.
    columns = sorted(headers)
    with open('compiled.csv', 'w') as compiled:
        writer = csv.writer(compiled)
        writer.writerow(columns)
        for row in values:
            writer.writerow([row.get(name, '') for name in columns])

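# Command-line interface: 'fetch' takes one or more post URLs, 'compile'
# takes CSV paths; a bare URL with no subcommand is fetched directly.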
if __name__ == "__main__":
    if len(argv) > 1:
        if argv[1] == 'fetch' and len(argv) > 2:
            for url in argv[2:]:
                fetch(urlparse.urlparse(url).path)
        elif argv[1] == 'compile':
            compile_races(argv[2:])
        else:
            fetch(urlparse.urlparse(argv[1]).path)