diff options
author | emkael <emkael@tlen.pl> | 2014-10-31 21:45:51 +0100 |
---|---|---|
committer | emkael <emkael@tlen.pl> | 2014-10-31 21:45:51 +0100 |
commit | b13813f96822ed7de314b6dd417831de8a641134 (patch) | |
tree | 0dc65154fd9531a31732e422619bb5b7aaebe775 /dumps/table2csv.py | |
parent | ae189fadac1f56b2c10f2a5d11bad99980a8fd46 (diff) |
* revised and reorganized dump scripts
Diffstat (limited to 'dumps/table2csv.py')
-rw-r--r-- | dumps/table2csv.py | 25 |
1 files changed, 25 insertions, 0 deletions
# dumps/table2csv.py -- added as a new file (mode 100644, index 0000000..267b7d9).
import csv
import os
import re
import string

from lxml import etree
from bs4 import BeautifulSoup


def convert(table, title, output_dir):
    """Dump one HTML table as raw markup (.txt) and as CSV (.csv).

    The output base name is ``title`` reduced to its alphanumeric
    characters; it is echoed to stdout before anything is written.

    Written for Python 2: the serialized markup and the CSV cells are
    handled as byte strings.

    Args:
        table: lxml element holding the table; serialized with
            ``etree.tostring``.
        title: text from which the output file name is derived.
        output_dir: directory receiving ``<name>.txt``. The CSV goes to
            ``<output_dir>/csv/<name>.csv``; that ``csv`` subdirectory is
            not created here, so it has to exist already.

    Raises:
        IOError: if either output file cannot be opened for writing.
    """
    name = "".join(char for char in title if char.isalnum())
    print(name)

    markup = etree.tostring(table)

    # Context managers make sure both files are flushed and closed even
    # if parsing or writing fails part-way through.
    with open(os.path.join(output_dir, name + '.txt'), 'w') as markup_file:
        markup_file.write(markup)
        markup_file.write('\n')

    soup = BeautifulSoup(markup)

    # Binary mode: the Python 2 csv module emits its own line terminators,
    # and text mode would turn them into "\r\r\n" on Windows.
    csv_path = os.path.join(output_dir, 'csv', name + '.csv')
    with open(csv_path, 'wb') as csv_handle:
        writer = csv.writer(csv_handle)
        for row in soup.find_all('tr'):
            # Match cell tags by exact name. A bare regex such as 't[dh]'
            # is applied as a search and would also pick up e.g. <thead>.
            cells = row.find_all(['td', 'th'])
            writer.writerow([
                # Collapse every whitespace run inside a cell to one space.
                re.sub(r'\s+', ' ',
                       " ".join(cell.stripped_strings)).encode('utf-8')
                for cell in cells
            ])