From b13813f96822ed7de314b6dd417831de8a641134 Mon Sep 17 00:00:00 2001 From: emkael Date: Fri, 31 Oct 2014 21:45:51 +0100 Subject: * revised and reorganized dump scripts --- dumps/_sites/__init__.py | 0 dumps/_sites/chicane-f1.py | 14 ++++++ dumps/_sites/second-a-lap.py | 18 ++++++++ dumps/chicane-f1/.gitignore | 1 + dumps/chicane-f1/csv/.gitignore | 1 + dumps/compile.py | 41 ++++++++++++++++++ dumps/dump.py | 20 +++++++++ dumps/second-a-lap.py | 90 --------------------------------------- dumps/second-a-lap/.gitignore | 1 - dumps/second-a-lap/csv/.gitignore | 1 + dumps/table2csv.py | 25 +++++++++++ 11 files changed, 121 insertions(+), 91 deletions(-) create mode 100644 dumps/_sites/__init__.py create mode 100644 dumps/_sites/chicane-f1.py create mode 100644 dumps/_sites/second-a-lap.py create mode 100644 dumps/chicane-f1/.gitignore create mode 100644 dumps/chicane-f1/csv/.gitignore create mode 100644 dumps/compile.py create mode 100755 dumps/dump.py delete mode 100755 dumps/second-a-lap.py create mode 100644 dumps/second-a-lap/csv/.gitignore create mode 100644 dumps/table2csv.py diff --git a/dumps/_sites/__init__.py b/dumps/_sites/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dumps/_sites/chicane-f1.py b/dumps/_sites/chicane-f1.py new file mode 100644 index 0000000..07d90c9 --- /dev/null +++ b/dumps/_sites/chicane-f1.py @@ -0,0 +1,14 @@ +import urllib +import urllib2 +import urlparse + +from lxml import html + +def fetch(url): + contents = urllib2.urlopen(url).read() + tree = html.fromstring(contents) + title = tree.xpath("//title")[0].text + tables = tree.xpath("//table[@cellpadding=6]") + print url + print title + return title, tables diff --git a/dumps/_sites/second-a-lap.py b/dumps/_sites/second-a-lap.py new file mode 100644 index 0000000..52591d6 --- /dev/null +++ b/dumps/_sites/second-a-lap.py @@ -0,0 +1,18 @@ +import json +import urllib +import urllib2 +import urlparse + +from lxml import html + +def fetch(url): + url = urlparse.urlparse(url).path + contents = json.loads(urllib2.urlopen('http://second-a-lap.blogspot.com/feeds/posts/default?' 
+ + urllib.urlencode({'alt': 'json', 'v': 2, 'dynamicviews': 1, 'path': url})).read()) + title = contents['feed']['entry'][0]['title']['$t'] + text = contents['feed']['entry'][0]['content']['$t'] + tree = html.fromstring(text) + tables = tree.xpath("//table[@bordercolor]") + print url + print title + return title, tables diff --git a/dumps/chicane-f1/.gitignore b/dumps/chicane-f1/.gitignore new file mode 100644 index 0000000..2211df6 --- /dev/null +++ b/dumps/chicane-f1/.gitignore @@ -0,0 +1 @@ +*.txt diff --git a/dumps/chicane-f1/csv/.gitignore b/dumps/chicane-f1/csv/.gitignore new file mode 100644 index 0000000..afed073 --- /dev/null +++ b/dumps/chicane-f1/csv/.gitignore @@ -0,0 +1 @@ +*.csv diff --git a/dumps/compile.py b/dumps/compile.py new file mode 100644 index 0000000..d87e154 --- /dev/null +++ b/dumps/compile.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python + +import csv +import string +from sys import argv + +def compile(files): + headers = set() + values = [] + writer = csv.writer(open('races.csv', 'w')) + race_id = 0 + for path in files: + try: + with open(path, 'r') as csvfile: + reader = csv.reader(csvfile) + header = next(reader) + headers = set(headers | set(header)) + for row in reader: + data = {} + i = 0 + for cell in row: + data[header[i]] = cell + data['Race'] = race_id + i += 1 + values.append(data) + writer.writerow([race_id, path, '', '', '']) + race_id += 1 + except IOError: + pass + headers.add('Race') + writer = csv.writer(open('compiled.csv', 'w')) + writer.writerow(list(headers)) + for row in values: + csvrow = [] + for name in headers: + if name in row: + csvrow.append(row[name]) + else: + csvrow.append('') + writer.writerow(csvrow) + diff --git a/dumps/dump.py b/dumps/dump.py new file mode 100755 index 0000000..b36e4a1 --- /dev/null +++ b/dumps/dump.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python + +from sys import argv +from compile import compile +from table2csv import convert + +if __name__ == "__main__": + if len(argv) > 1: + if argv[1] == 'compile': + compile(argv[2:]) + elif len(argv) > 2: + if argv[1] == 'fetch': + argv.remove('fetch') + fetch = __import__('_sites.' + argv[1], globals(), locals(), ['fetch']) + for url in argv[2:]: + title, tables = fetch.fetch(url) + i = 1 + for table in tables: + convert(table, title + '-' + str(i), argv[1]) + i += 1 diff --git a/dumps/second-a-lap.py b/dumps/second-a-lap.py deleted file mode 100755 index a6665c9..0000000 --- a/dumps/second-a-lap.py +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env python - -import csv -import json -import os -import pprint -import re -import string -import urllib -import urllib2 -import urlparse -from sys import argv - -from bs4 import BeautifulSoup -from lxml import etree, html - - -def fetch(url): - print url - contents = json.loads(urllib2.urlopen('http://second-a-lap.blogspot.com/feeds/posts/default?' 
-        + urllib.urlencode({'alt': 'json', 'v': 2, 'dynamicviews': 1, 'path': url})).read())
-    title = contents['feed']['entry'][0]['title']['$t']
-    print title
-    text = contents['feed']['entry'][0]['content']['$t']
-    tree = html.fromstring(text)
-    tables = tree.xpath("//table[@bordercolor]")
-    i = 1
-    for table in tables:
-        name = "".join(x for x in title if x.isalnum()) + '-' + str(i)
-        print name
-        path = open(os.path.join('second-a-lap', name + '.txt'), 'w')
-        table = etree.tostring(table)
-        print >>path, table
-        csv_file = csv.writer(
-            open(os.path.join('second-a-lap', 'csv', name + '.csv'), 'w'))
-        soup = BeautifulSoup(table)
-        for row in soup.find_all('tr'):
-            row = map(
-                lambda t: re.sub('\s+',
-                                 ' ',
-                                 " ".join(t.stripped_strings)).encode('utf-8'),
-                row.find_all(re.compile('t[dh]')))
-            csv_file.writerow(row)
-        i += 1
-
-
-def compile(files):
-    headers = set()
-    values = []
-    writer = csv.writer(open('races.csv', 'w'))
-    race_id = 0
-    for path in files:
-        try:
-            with open(path, 'r') as csvfile:
-                reader = csv.reader(csvfile)
-                header = next(reader)
-                headers = set(headers | set(header))
-                for row in reader:
-                    data = {}
-                    i = 0
-                    for cell in row:
-                        data[header[i]] = cell
-                        data['Race'] = race_id
-                        i += 1
-                    values.append(data)
-                writer.writerow([race_id, path, '', '', ''])
-                race_id += 1
-        except IOError:
-            pass
-    headers.add('Race')
-    writer = csv.writer(open('compiled.csv', 'w'))
-    writer.writerow(list(headers))
-    for row in values:
-        csvrow = []
-        for name in headers:
-            if name in row:
-                csvrow.append(row[name])
-            else:
-                csvrow.append('')
-        writer.writerow(csvrow)
-
-if __name__ == "__main__":
-    if len(argv) > 1:
-        if argv[1] == 'fetch' and len(argv) > 2:
-            for url in argv[2:]:
-                fetch(urlparse.urlparse(url).path)
-        elif argv[1] == 'compile':
-            compile(argv[2:])
-        else:
-            fetch(urlparse.urlparse(argv[1]).path)
diff --git a/dumps/second-a-lap/.gitignore b/dumps/second-a-lap/.gitignore
index ea672e2..2211df6 100644
--- a/dumps/second-a-lap/.gitignore
+++ b/dumps/second-a-lap/.gitignore
@@ -1,2 +1 @@
 *.txt
-csv
diff --git a/dumps/second-a-lap/csv/.gitignore b/dumps/second-a-lap/csv/.gitignore
new file mode 100644
index 0000000..afed073
--- /dev/null
+++ b/dumps/second-a-lap/csv/.gitignore
@@ -0,0 +1 @@
+*.csv
diff --git a/dumps/table2csv.py b/dumps/table2csv.py
new file mode 100644
index 0000000..267b7d9
--- /dev/null
+++ b/dumps/table2csv.py
@@ -0,0 +1,25 @@
+import csv
+import os
+import re
+import string
+
+from lxml import etree
+from bs4 import BeautifulSoup
+
+def convert(table, title, output_dir):
+    name = "".join(x for x in title if x.isalnum())
+    print name
+    path = open(os.path.join(output_dir, name + '.txt'), 'w')
+    table = etree.tostring(table)
+    print >>path, table
+    csv_file = csv.writer(
+        open(os.path.join(output_dir, 'csv', name + '.csv'), 'w'))
+    soup = BeautifulSoup(table)
+    for row in soup.find_all('tr'):
+        row = map(
+            lambda t: re.sub('\s+',
+                             ' ',
+                             " ".join(t.stripped_strings)).encode('utf-8'),
+            row.find_all(re.compile('t[dh]')))
+        csv_file.writerow(row)
+
-- cgit v1.2.3
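
The reorganization above turns each scraped site into a pluggable module: dump.py imports
'_sites.<name>' for the name given on the command line and only requires that the module
expose fetch(url), returning a (title, tables) pair whose tables are lxml elements for
table2csv.convert() to serialize. A minimal template for wiring up another site could look
like the sketch below; the 'example-site' file name and the catch-all //table XPath are
placeholders, not part of the patch.

    # _sites/example-site.py -- hypothetical skeleton for a new site module.
    # dump.py resolves it via __import__('_sites.example-site', ..., ['fetch'])
    # and expects fetch(url) to return (title, tables); tables must be lxml
    # elements so that table2csv.convert() can serialize them.
    import urllib2

    from lxml import html

    def fetch(url):
        contents = urllib2.urlopen(url).read()
        tree = html.fromstring(contents)
        title = tree.xpath("//title")[0].text
        # narrow this selector to the tables worth dumping, as the real
        # modules do with @cellpadding=6 and @bordercolor
        tables = tree.xpath("//table")
        print url
        print title
        return title, tables

Invocation then mirrors the built-in sites, e.g. python dump.py fetch example-site <post-url>.
Note that convert() writes example-site/<name>.txt and example-site/csv/<name>.csv with plain
open() calls, so both directories have to exist beforehand -- which is what the .gitignore
stubs in this patch set up for chicane-f1 and second-a-lap.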
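
The compile step runs independently of fetching. A small driver, as a sketch: compile() is
the function from dumps/compile.py above, while the glob pattern is an assumption -- any list
of CSV paths works, and unreadable paths are skipped by the IOError handler.

    # Hypothetical driver: merge previously dumped per-table CSV files.
    import glob

    from compile import compile  # dumps/compile.py above

    if __name__ == '__main__':
        # equivalent to: python dump.py compile second-a-lap/csv/*.csv
        compile(sorted(glob.glob('second-a-lap/csv/*.csv')))

This writes races.csv (one id-and-source row per input file) next to compiled.csv, whose
header row is the union of every input's columns plus the synthetic 'Race' column; cells a
given source never had are left empty.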