diff options
author | emkael <emkael@tlen.pl> | 2014-10-12 22:12:30 +0200 |
---|---|---|
committer | emkael <emkael@tlen.pl> | 2014-10-12 22:12:30 +0200 |
commit | e8adaa0eb15f1b83a430ec35046b4d2756e02885 (patch) | |
tree | 9582d5cc4410e74654b157c4a7f1c270285cebb0 /dumps/second-a-lap.py | |
parent | 3d39c79d532800a8bdb7e410a1d441dfbc31cb01 (diff) |
* Second A Lap blog scraper
Diffstat (limited to 'dumps/second-a-lap.py')
-rwxr-xr-x | dumps/second-a-lap.py | 28 |
1 files changed, 28 insertions, 0 deletions
diff --git a/dumps/second-a-lap.py b/dumps/second-a-lap.py new file mode 100755 index 0000000..e8221b5 --- /dev/null +++ b/dumps/second-a-lap.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python + +from sys import argv +import urlparse, urllib, urllib2 +import json, pprint +from lxml import html, etree +import os, string + +def fetch(url): + print url + contents = json.loads(urllib2.urlopen('http://second-a-lap.blogspot.com/feeds/posts/default?'+urllib.urlencode({ 'alt': 'json', 'v': 2, 'dynamicviews': 1, 'path': url })).read()) + title = contents['feed']['entry'][0]['title']['$t'] + print title + text = contents['feed']['entry'][0]['content']['$t'] + tree = html.fromstring(text) + tables = tree.xpath("//table[@bordercolor]") + i = 1 + for table in tables: + name = "".join(x for x in title if x.isalnum()) + '-' + str(i) + '.txt' + print name + path = open(name, 'w') + print >>path, etree.tostring(table) + i += 1 + +if __name__ == "__main__": + if len(argv) > 1: + url = urlparse.urlparse(argv[1]) + fetch(url.path) |