author     emkael <emkael@tlen.pl>  2014-10-31 21:45:51 +0100
committer  emkael <emkael@tlen.pl>  2014-10-31 21:45:51 +0100
commit     b13813f96822ed7de314b6dd417831de8a641134 (patch)
tree       0dc65154fd9531a31732e422619bb5b7aaebe775
parent     ae189fadac1f56b2c10f2a5d11bad99980a8fd46 (diff)
* revised and reorganized dump scripts
-rw-r--r--  dumps/_sites/__init__.py            0
-rw-r--r--  dumps/_sites/chicane-f1.py         14
-rw-r--r--  dumps/_sites/second-a-lap.py       18
-rw-r--r--  dumps/chicane-f1/.gitignore         1
-rw-r--r--  dumps/chicane-f1/csv/.gitignore     1
-rw-r--r--  dumps/compile.py                   41
-rwxr-xr-x  dumps/dump.py                      20
-rwxr-xr-x  dumps/second-a-lap.py              90
-rw-r--r--  dumps/second-a-lap/.gitignore       1
-rw-r--r--  dumps/second-a-lap/csv/.gitignore   1
-rw-r--r--  dumps/table2csv.py                 25
11 files changed, 121 insertions(+), 91 deletions(-)
diff --git a/dumps/_sites/__init__.py b/dumps/_sites/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/dumps/_sites/__init__.py
diff --git a/dumps/_sites/chicane-f1.py b/dumps/_sites/chicane-f1.py
new file mode 100644
index 0000000..07d90c9
--- /dev/null
+++ b/dumps/_sites/chicane-f1.py
@@ -0,0 +1,14 @@
+import urllib
+import urllib2
+import urlparse
+
+from lxml import html
+
+def fetch(url):
+ contents = urllib2.urlopen(url).read()
+ tree = html.fromstring(contents)
+ title = tree.xpath("//title")[0].text
+ tables = tree.xpath("//table[@cellpadding=6]")
+ print url
+ print title
+ return title, tables
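dump.py treats each module in the _sites package as a site plugin: it imports the module by name and expects a fetch(url) callable returning a (title, tables) pair, where tables is a list of lxml table elements. A minimal sketch of a new plugin following that contract (the class-based XPath selector is hypothetical; each site uses whatever markup marks its data tables):

import urllib2

from lxml import html

def fetch(url):
    # download and parse the page
    contents = urllib2.urlopen(url).read()
    tree = html.fromstring(contents)
    # the page title becomes the base name of the dumped files
    title = tree.xpath("//title")[0].text
    # select whatever attribute marks data tables on this site
    tables = tree.xpath("//table[@class='results']")
    return title, tables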
diff --git a/dumps/_sites/second-a-lap.py b/dumps/_sites/second-a-lap.py
new file mode 100644
index 0000000..52591d6
--- /dev/null
+++ b/dumps/_sites/second-a-lap.py
@@ -0,0 +1,18 @@
+import json
+import urllib
+import urllib2
+import urlparse
+
+from lxml import html
+
+def fetch(url):
+ url = urlparse.urlparse(url).path
+ contents = json.loads(urllib2.urlopen('http://second-a-lap.blogspot.com/feeds/posts/default?' +
+ urllib.urlencode({'alt': 'json', 'v': 2, 'dynamicviews': 1, 'path': url})).read())
+ title = contents['feed']['entry'][0]['title']['$t']
+ text = contents['feed']['entry'][0]['content']['$t']
+ tree = html.fromstring(text)
+ tables = tree.xpath("//table[@bordercolor]")
+ print url
+ print title
+ return title, tables
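Rather than scraping the rendered page, this fetcher asks Blogger's feed API for the post at the given path, then reads the title and content out of the first feed entry. For a post at /2014/10/some-post.html (a hypothetical path), the urlencode call above would produce a request roughly like this (parameter order may vary):

http://second-a-lap.blogspot.com/feeds/posts/default?alt=json&v=2&dynamicviews=1&path=%2F2014%2F10%2Fsome-post.html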
diff --git a/dumps/chicane-f1/.gitignore b/dumps/chicane-f1/.gitignore
new file mode 100644
index 0000000..2211df6
--- /dev/null
+++ b/dumps/chicane-f1/.gitignore
@@ -0,0 +1 @@
+*.txt
diff --git a/dumps/chicane-f1/csv/.gitignore b/dumps/chicane-f1/csv/.gitignore
new file mode 100644
index 0000000..afed073
--- /dev/null
+++ b/dumps/chicane-f1/csv/.gitignore
@@ -0,0 +1 @@
+*.csv
diff --git a/dumps/compile.py b/dumps/compile.py
new file mode 100644
index 0000000..d87e154
--- /dev/null
+++ b/dumps/compile.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+
+import csv
+import string
+from sys import argv
+
+def compile(files):
+ headers = set()
+ values = []
+ writer = csv.writer(open('races.csv', 'w'))
+ race_id = 0
+ for path in files:
+ try:
+ with open(path, 'r') as csvfile:
+ reader = csv.reader(csvfile)
+ header = next(reader)
+ headers = set(headers | set(header))
+ for row in reader:
+ data = {}
+ i = 0
+ for cell in row:
+ data[header[i]] = cell
+ data['Race'] = race_id
+ i += 1
+ values.append(data)
+ writer.writerow([race_id, path, '', '', ''])
+ race_id += 1
+ except IOError:
+ pass
+ headers.add('Race')
+ writer = csv.writer(open('compiled.csv', 'w'))
+ writer.writerow(list(headers))
+ for row in values:
+ csvrow = []
+ for name in headers:
+ if name in row:
+ csvrow.append(row[name])
+ else:
+ csvrow.append('')
+ writer.writerow(csvrow)
+
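compile() merges any number of per-table CSV dumps into one compiled.csv whose header row is the union of all input headers; rows missing a column get an empty cell, and every row is tagged with a numeric Race id that races.csv maps back to the source file. A minimal sketch of calling it directly (the input paths are hypothetical):

from compile import compile

# merge two dumped tables; results land in compiled.csv and races.csv
compile(['second-a-lap/csv/Race1-1.csv', 'second-a-lap/csv/Race2-1.csv'])

Note that because headers is a set, the column order of compiled.csv is unspecified.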
diff --git a/dumps/dump.py b/dumps/dump.py
new file mode 100755
index 0000000..b36e4a1
--- /dev/null
+++ b/dumps/dump.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+
+from sys import argv
+from compile import compile
+from table2csv import convert
+
+if __name__ == "__main__":
+ if len(argv) > 1:
+ if argv[1] == 'compile':
+ compile(argv[2:])
+ elif len(argv) > 2:
+ if argv[1] == 'fetch':
+ argv.remove('fetch')
+ fetch = __import__('_sites.' + argv[1], globals(), locals(), ['fetch'])
+ for url in argv[2:]:
+ title, tables = fetch.fetch(url)
+ i = 1
+ for table in tables:
+ convert(table, title + '-' + str(i), argv[1])
+ i += 1
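Together these pieces give dump.py a two-mode command line: fetch loads the named _sites plugin and converts every table it returns (the plugin name doubles as the output directory), while compile merges previously dumped CSVs. Example invocations (the URL and paths are hypothetical):

./dump.py fetch second-a-lap http://second-a-lap.blogspot.com/2014/10/some-post.html
./dump.py compile second-a-lap/csv/*.csv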
diff --git a/dumps/second-a-lap.py b/dumps/second-a-lap.py
deleted file mode 100755
index a6665c9..0000000
--- a/dumps/second-a-lap.py
+++ /dev/null
@@ -1,90 +0,0 @@
-#!/usr/bin/env python
-
-import csv
-import json
-import os
-import pprint
-import re
-import string
-import urllib
-import urllib2
-import urlparse
-from sys import argv
-
-from bs4 import BeautifulSoup
-from lxml import etree, html
-
-
-def fetch(url):
- print url
- contents = json.loads(urllib2.urlopen('http://second-a-lap.blogspot.com/feeds/posts/default?' +
- urllib.urlencode({'alt': 'json', 'v': 2, 'dynamicviews': 1, 'path': url})).read())
- title = contents['feed']['entry'][0]['title']['$t']
- print title
- text = contents['feed']['entry'][0]['content']['$t']
- tree = html.fromstring(text)
- tables = tree.xpath("//table[@bordercolor]")
- i = 1
- for table in tables:
- name = "".join(x for x in title if x.isalnum()) + '-' + str(i)
- print name
- path = open(os.path.join('second-a-lap', name + '.txt'), 'w')
- table = etree.tostring(table)
- print >>path, table
- csv_file = csv.writer(
- open(os.path.join('second-a-lap', 'csv', name + '.csv'), 'w'))
- soup = BeautifulSoup(table)
- for row in soup.find_all('tr'):
- row = map(
- lambda t: re.sub('\s+',
- ' ',
- " ".join(t.stripped_strings)).encode('utf-8'),
- row.find_all(re.compile('t[dh]')))
- csv_file.writerow(row)
- i += 1
-
-
-def compile(files):
- headers = set()
- values = []
- writer = csv.writer(open('races.csv', 'w'))
- race_id = 0
- for path in files:
- try:
- with open(path, 'r') as csvfile:
- reader = csv.reader(csvfile)
- header = next(reader)
- headers = set(headers | set(header))
- for row in reader:
- data = {}
- i = 0
- for cell in row:
- data[header[i]] = cell
- data['Race'] = race_id
- i += 1
- values.append(data)
- writer.writerow([race_id, path, '', '', ''])
- race_id += 1
- except IOError:
- pass
- headers.add('Race')
- writer = csv.writer(open('compiled.csv', 'w'))
- writer.writerow(list(headers))
- for row in values:
- csvrow = []
- for name in headers:
- if name in row:
- csvrow.append(row[name])
- else:
- csvrow.append('')
- writer.writerow(csvrow)
-
-if __name__ == "__main__":
- if len(argv) > 1:
- if argv[1] == 'fetch' and len(argv) > 2:
- for url in argv[2:]:
- fetch(urlparse.urlparse(url).path)
- elif argv[1] == 'compile':
- compile(argv[2:])
- else:
- fetch(urlparse.urlparse(argv[1]).path)
diff --git a/dumps/second-a-lap/.gitignore b/dumps/second-a-lap/.gitignore
index ea672e2..2211df6 100644
--- a/dumps/second-a-lap/.gitignore
+++ b/dumps/second-a-lap/.gitignore
@@ -1,2 +1 @@
*.txt
-csv
diff --git a/dumps/second-a-lap/csv/.gitignore b/dumps/second-a-lap/csv/.gitignore
new file mode 100644
index 0000000..afed073
--- /dev/null
+++ b/dumps/second-a-lap/csv/.gitignore
@@ -0,0 +1 @@
+*.csv
diff --git a/dumps/table2csv.py b/dumps/table2csv.py
new file mode 100644
index 0000000..267b7d9
--- /dev/null
+++ b/dumps/table2csv.py
@@ -0,0 +1,25 @@
+import csv
+import os
+import re
+import string
+
+from lxml import etree
+from bs4 import BeautifulSoup
+
+def convert(table, title, output_dir):
+ name = "".join(x for x in title if x.isalnum())
+ print name
+ path = open(os.path.join(output_dir, name + '.txt'), 'w')
+ table = etree.tostring(table)
+ print >>path, table
+ csv_file = csv.writer(
+ open(os.path.join(output_dir, 'csv', name + '.csv'), 'w'))
+ soup = BeautifulSoup(table)
+ for row in soup.find_all('tr'):
+ row = map(
+ lambda t: re.sub('\s+',
+ ' ',
+ " ".join(t.stripped_strings)).encode('utf-8'),
+ row.find_all(re.compile('t[dh]')))
+ csv_file.writerow(row)
+
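convert() writes each table twice: the raw HTML goes to <output_dir>/<name>.txt, and every row, with its cell text whitespace-normalized, goes to <output_dir>/csv/<name>.csv. A minimal sketch of calling it outside dump.py (the markup is hypothetical; both output directories must already exist):

from lxml import html
from table2csv import convert

table = html.fromstring('<table><tr><th>Pos</th><th>Driver</th></tr>'
                        '<tr><td>1</td><td>Example</td></tr></table>')
# produces second-a-lap/Exampletable1.txt and second-a-lap/csv/Exampletable1.csv
convert(table, 'Example table 1', 'second-a-lap')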