#! /usr/bin/env python
# -*- encoding: utf-8 -*-

# Link extractor for RSS feeds.

# Sample request:
# http://localhost:8080/beta/rss?uri=http://whym.tumblr.com/rss&fil=&sub=
|---|
| 9 | from xml.dom import minidom |
|---|
| 10 | import urlparse |
|---|
| 11 | import urllib |
|---|
| 12 | import sys |
|---|
| 13 | import re |
|---|
| 14 | from cgi import parse_qsl |
|---|
| 15 | import yaml |
|---|
| 16 | import os |
|---|
| 17 | |
|---|
| 18 | from urllib import FancyURLopener |
|---|
class MyOpener(FancyURLopener):
    """FancyURLopener variant that overrides the opener's version string."""

    # The opener's ``version`` attribute supplies the User-Agent it sends;
    # this one mimics a desktop Firefox browser.
    version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'
|---|
# One shared opener instance; urllib's module-level helpers are rebound to
# its methods so every urllib.urlopen / urllib.urlretrieve call goes
# through MyOpener.
myopener = MyOpener()
urllib.urlopen, urllib.urlretrieve = myopener.open, myopener.retrieve
|---|
| 24 | |
|---|
def get_args():
    """Collect the script's options from the config file and the CGI query.

    Options are first read from ``rsslinker.yaml`` (resolved against the
    current working directory); an empty document counts as "no options".
    When the ``QUERY_STRING`` environment variable is set, its key/value
    pairs are layered on top, so request parameters override configured
    ones.

    Returns:
        dict: option name -> value.

    Raises:
        IOError: if ``rsslinker.yaml`` cannot be opened.
    """
    # The with-block closes the config file instead of leaking the handle.
    with open('rsslinker.yaml') as config:
        # NOTE(review): yaml.load can construct arbitrary Python objects.
        # The file is local configuration, so it is kept as-is here, but
        # yaml.safe_load is the safer choice if plain YAML is all it holds.
        ret = yaml.load(config) or {}

    query = os.environ.get('QUERY_STRING')
    if query is not None:
        ret.update(parse_qsl(query))
    return ret
|---|
| 34 | |
|---|
# Anchor matcher.  Attribute text is limited to ``[^>]`` so a match can never
# run past the end of the opening tag and borrow an ``href`` from a later
# element; DOTALL lets the link text (and the tag itself) span several lines,
# and IGNORECASE accepts ``<A HREF=...>`` as HTML does.
_LINK_RE = re.compile(
    r'<a\s[^>]*?href="([^"]*)"[^>]*>(.*?)</a>',
    re.IGNORECASE | re.DOTALL,
)


def extract_links(html):
    """Return the hyperlinks found in an HTML fragment.

    Only ``<a>`` elements whose ``href`` value is double-quoted are
    recognised.

    Args:
        html: the HTML text to scan.

    Returns:
        list of ``(title, href)`` tuples in document order, where *title* is
        the raw markup between ``<a ...>`` and ``</a>``.  Empty when nothing
        matches.
    """
    return [(m.group(2), m.group(1)) for m in _LINK_RE.finditer(html)]
|---|
| 40 | |
|---|
def shorten_url(url, shortener, tag='shortUrl'):
    """Ask a URL-shortening web service for the short form of *url*.

    Args:
        url: the URL to shorten.
        shortener: ``%``-format template of the service's request URL; it is
            formatted with the percent-quoted *url* as its single argument.
        tag: name of the XML element in the service's response whose text
            content is the shortened URL.

    Returns:
        The text of the first *tag* element of the XML response.

    Raises:
        IndexError: the response contains no *tag* element.
        AttributeError: the first *tag* element has no child node.
    """
    # Python 2's urllib.quote raises KeyError for unicode strings that hold
    # non-ASCII characters, so anything that is not already a byte string is
    # encoded to UTF-8 before quoting.
    if not isinstance(url, str):
        url = url.encode('utf-8')

    response = urllib.urlopen(shortener % urllib.quote(url))
    try:
        payload = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()

    doc = minidom.parseString(payload)
    return doc.getElementsByTagName(tag)[0].firstChild.data
|---|
| 45 | |
|---|
def _links_in_spans(html, span_pat):
    """Return the (title, href) pairs found inside each *span_pat* match.

    Each match is taken from the original *html* through the match object
    itself, so offsets never drift between iterations, and empty matches
    (such as the trailing one that ``.*`` yields) contribute no links.
    """
    found = []
    for match in re.finditer(span_pat, html):
        found.extend(extract_links(match.group(0)))
    return found


def _linkify_text_node(text, uri, span_pat, shortener):
    """Replace a text node's content with a ``title url, title url`` list.

    Returns False, leaving the node untouched, when it holds no links.
    """
    entries = []
    for title, path in _links_in_spans(text.data, span_pat):
        if path.startswith('/'):
            # Resolve against the feed URI so its scheme and port are kept
            # and protocol-relative ("//host/...") links stay valid.
            path = urlparse.urljoin(uri, path)
        if shortener:
            path = shorten_url(path, shortener)
        entries.append(title + ' ' + path)
    if not entries:
        return False
    text.data = ', '.join(entries)
    return True


def _item_has_links(item, targettag, uri, span_pat, shortener):
    """Linkify every text node under *item*'s target elements.

    Returns False as soon as one text node turns out to contain no links;
    the caller then drops the whole item, so the rest is not processed.
    """
    for target in item.getElementsByTagName(targettag):
        for node in target.childNodes:
            if node.nodeType != node.TEXT_NODE:
                continue
            if not _linkify_text_node(node, uri, span_pat, shortener):
                return False
    return True


def main():
    """CGI entry point: emit the feed with descriptions reduced to links.

    Options (from get_args()):
        uri       -- address of the RSS feed to fetch (required)
        target    -- element inside each item to rewrite ('description')
        span      -- regex selecting the parts of the text to scan ('.*')
        shortener -- request-URL template passed to shorten_url (optional)

    Items in which a target text node contains no link are removed from
    the output document.
    """
    out = sys.stdout.write
    out('Content-Type: text/xml; charset="UTF-8"\n')
    out('\n')

    options = get_args()
    uri = options.get('uri')
    targettag = options.get('target', 'description')
    span_pat = re.compile(options.get('span', r'.*'))
    shortener = options.get('shortener')
    itemtag = 'item'

    if not uri:
        out('<error>no uri provided</error>\n')
        sys.exit()

    feed = urllib.urlopen(uri)
    try:
        original_rss = feed.read()
    finally:
        feed.close()
    doc = minidom.parseString(original_rss)

    for item in doc.getElementsByTagName(itemtag):
        if _item_has_links(item, targettag, uri, span_pat, shortener):
            continue
        # parentNode is None once an item has been detached; the guard
        # keeps a second removal attempt from failing.
        if item.parentNode is not None:
            item.parentNode.removeChild(item)

    # TODO: when the description is empty, delete the whole item
    out(doc.toxml().encode('utf-8') + '\n')


if __name__ == '__main__':
    main()
|---|