summaryrefslogtreecommitdiff
path: root/dumps/table2csv.py
blob: 267b7d965c65a39f9835652c042cebc022166ed6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import csv
import os
import re
import string

from lxml import etree
from bs4 import BeautifulSoup

def convert(table, title, output_dir):
    """Dump an HTML table as raw markup and as a CSV file.

    Two files are written, both named after ``title`` with every
    non-alphanumeric character removed:

    * ``<output_dir>/<name>.txt`` -- the markup produced by
      ``etree.tostring(table)``, followed by a newline.
    * ``<output_dir>/csv/<name>.csv`` -- one CSV row per ``<tr>``, one
      field per ``<td>``/``<th>`` cell.

    Cell text is the cell's stripped strings joined with single spaces,
    with runs of whitespace collapsed to one space, encoded as UTF-8.

    This is Python 2 code: cells are handed to ``csv`` as byte strings.

    Args:
        table: element accepted by ``lxml.etree.tostring``.
        title: string used to derive the output file name; it is also
            printed to stdout once sanitised.
        output_dir: directory receiving the ``.txt`` file.  Neither it
            nor its ``csv`` subdirectory is created here.

    Raises:
        IOError: if either output file cannot be opened.
    """
    name = "".join(x for x in title if x.isalnum())
    print(name)

    markup = etree.tostring(table)
    with open(os.path.join(output_dir, name + '.txt'), 'w') as txt_file:
        txt_file.write(markup + '\n')

    # Compile once instead of once per row / per cell.
    whitespace = re.compile(r'\s+')
    cell_tag = re.compile('t[dh]')

    soup = BeautifulSoup(markup)
    # Binary mode: the Python 2 csv module writes its own line terminators
    # and text mode would double them on newline-translating platforms.
    with open(os.path.join(output_dir, 'csv', name + '.csv'), 'wb') as out:
        writer = csv.writer(out)
        for tr in soup.find_all('tr'):
            cells = [
                whitespace.sub(
                    ' ', " ".join(cell.stripped_strings)).encode('utf-8')
                for cell in tr.find_all(cell_tag)
            ]
            writer.writerow(cells)