summaryrefslogtreecommitdiff
path: root/dumps/table2csv.py
diff options
context:
space:
mode:
author emkael <emkael@tlen.pl> 2014-10-31 21:45:51 +0100
committer emkael <emkael@tlen.pl> 2014-10-31 21:45:51 +0100
commit b13813f96822ed7de314b6dd417831de8a641134 (patch)
tree 0dc65154fd9531a31732e422619bb5b7aaebe775 /dumps/table2csv.py
parent ae189fadac1f56b2c10f2a5d11bad99980a8fd46 (diff)
* revised and reorganized dump scripts
Diffstat (limited to 'dumps/table2csv.py')
-rw-r--r-- dumps/table2csv.py 25
1 files changed, 25 insertions, 0 deletions
diff --git a/dumps/table2csv.py b/dumps/table2csv.py
new file mode 100644
index 0000000..267b7d9
--- /dev/null
+++ b/dumps/table2csv.py
@@ -0,0 +1,25 @@
+import csv
+import os
+import re
+import string
+
+from lxml import etree
+from bs4 import BeautifulSoup
+
+def convert(table, title, output_dir):
+ name = "".join(x for x in title if x.isalnum())
+ print name
+ path = open(os.path.join(output_dir, name + '.txt'), 'w')
+ table = etree.tostring(table)
+ print >>path, table
+ csv_file = csv.writer(
+ open(os.path.join(output_dir, 'csv', name + '.csv'), 'w'))
+ soup = BeautifulSoup(table)
+ for row in soup.find_all('tr'):
+ row = map(
+ lambda t: re.sub('\s+',
+ ' ',
+ " ".join(t.stripped_strings)).encode('utf-8'),
+ row.find_all(re.compile('t[dh]')))
+ csv_file.writerow(row)
+