#!/usr/bin/env python
"""Second A Lap blog scraper.

Downloads a single post from the second-a-lap.blogspot.com JSON feed and
writes every ``<table>`` element carrying a ``bordercolor`` attribute to its
own text file in the current working directory.

Usage: second-a-lap.py POST_URL
"""
# Origin: dumps/second-a-lap.py, commit e8adaa0 (emkael, 2014-10-12).

from sys import argv
import contextlib
import json
import os
import pprint
import string

try:  # Python 3 module layout
    from urllib.parse import urlencode, urlparse
    from urllib.request import urlopen
except ImportError:  # Python 2 module layout
    from urllib import urlencode
    from urllib2 import urlopen
    from urlparse import urlparse

from lxml import etree, html

# Base address of the blog's post feed; the query string is appended to it.
FEED_URL = 'http://second-a-lap.blogspot.com/feeds/posts/default?'


def fetch(url):
    """Download one blog post and dump its bordered tables to text files.

    The post is requested from the blog feed as JSON, its HTML content is
    parsed with lxml, and each table matching ``//table[@bordercolor]`` is
    serialized to ``<title>-<n>.txt``, where ``<title>`` is the post title
    reduced to its alphanumeric characters and ``<n>`` counts from 1.

    Args:
        url: Path component of the post URL (what ``urlparse(...).path``
            yields); it is sent as the ``path`` query parameter of the feed.

    Returns:
        None. The requested path, the post title and every file name written
        are printed to standard output as progress information.

    Raises:
        KeyError, IndexError: If the feed response lacks the expected
            ``feed -> entry[0] -> title/content -> $t`` structure.
        Exceptions raised by ``urlopen``, ``json.loads`` or ``open`` are
        propagated unchanged.
    """
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(url)
    query = urlencode({'alt': 'json', 'v': 2, 'dynamicviews': 1, 'path': url})
    # closing() releases the connection on both Python 2 and 3; the Python 2
    # response object is not a context manager on its own.
    with contextlib.closing(urlopen(FEED_URL + query)) as response:
        # Decode explicitly: json.loads() only accepts bytes from Python 3.6,
        # and Python 2 already treated byte strings as UTF-8 by default.
        contents = json.loads(response.read().decode('utf-8'))
    entry = contents['feed']['entry'][0]
    title = entry['title']['$t']
    print(title)
    tree = html.fromstring(entry['content']['$t'])
    # The file-name prefix depends only on the title, so build it once.
    prefix = "".join(x for x in title if x.isalnum())
    for i, table in enumerate(tree.xpath("//table[@bordercolor]"), 1):
        name = prefix + '-' + str(i) + '.txt'
        print(name)
        # etree.tostring() returns a byte string, hence binary mode; the
        # with-block guarantees the file is flushed and closed.
        with open(name, 'wb') as out:
            out.write(etree.tostring(table) + b'\n')


if __name__ == "__main__":
    # Without a URL argument the script intentionally does nothing.
    if len(argv) > 1:
        fetch(urlparse(argv[1]).path)