root/websites/appspot.com/filteroid/trunk/beta/rsslinker.py @ 36573

Revision 36573, 2.8 kB (checked in by whym, 5 years ago)

* rsslinker

Line 
1#! /usr/bin/env python
2# -*- encoding: utf-8 -*-
3
4# link extractor for rss
5
6# sample:
7# http://localhost:8080/beta/rss?uri=http://whym.tumblr.com/rss&fil=&sub=
8
9from xml.dom import minidom
10import urlparse
11import urllib
12import sys
13import re
14from cgi import parse_qsl
15import yaml
16import os
17
from urllib import FancyURLopener
# Subclass FancyURLopener solely to override the User-Agent header:
# some servers refuse requests that carry the default Python urllib agent.
class MyOpener(FancyURLopener):
    version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'
myopener = MyOpener()
# Monkey-patch the module-level urllib helpers so that every
# urllib.urlopen/urlretrieve call in this script goes through the
# spoofed-User-Agent opener above.
urllib.urlopen = myopener.open
urllib.urlretrieve = myopener.retrieve
24
def get_args():
    """Build the parameter dict for this CGI script.

    Defaults are read from the local ``rsslinker.yaml`` file, then each
    name=value pair from the CGI ``QUERY_STRING`` environment variable is
    laid on top, so query-string values win over YAML defaults.

    Returns a dict mapping parameter names to string values ({} when the
    YAML file is empty and no query string is set).
    """
    # safe_load: the YAML file only carries plain defaults, so there is no
    # reason to allow arbitrary-object construction; also close the file
    # handle explicitly instead of leaking it.
    f = open('rsslinker.yaml')
    try:
        ret = yaml.safe_load(f)
    finally:
        f.close()
    if not ret:
        ret = {}
    args = os.environ
    # 'in' instead of the deprecated dict.has_key()
    if 'QUERY_STRING' in args:
        for (name, value) in parse_qsl(args['QUERY_STRING']):
            ret[name] = value
    return ret
34
def extract_links(html):
    """Scan *html* for anchor tags and return (link text, href) pairs.

    Pairs are returned in document order; nothing else of the markup is
    inspected, so nested or malformed tags are handled only as far as the
    regular expression allows.
    """
    anchor = r'<a .*?href="(.*?)".*?>(.*?)</a>'
    return [(m.group(2), m.group(1)) for m in re.finditer(anchor, html)]
40
def shorten_url(url, shortener, tag='shortUrl'):
    """Shorten *url* via an XML-returning web service.

    shortener -- URL template with a single %s placeholder that receives
                 the percent-encoded long URL.
    tag       -- name of the XML element in the service's response whose
                 text content is the shortened URL (default 'shortUrl').

    Returns the text of the first matching element.  Raises IndexError if
    the response contains no such element.
    """
    # Close the HTTP response explicitly instead of leaking the handle.
    response = urllib.urlopen(shortener % urllib.quote(url))
    try:
        doc = minidom.parseString(response.read())
    finally:
        response.close()
    return doc.getElementsByTagName(tag)[0].firstChild.data
45
if __name__ == '__main__':
    # CGI entry point: fetch an RSS feed, replace the text of each item's
    # target element with a comma-separated list of the links extracted
    # from it, and print the transformed feed as XML.
    print 'Content-Type: text/xml; charset="UTF-8"'
    print ''

    # Defaults; each may be overridden via rsslinker.yaml or the query string.
    uri = None
    targettag = 'description'  # element whose text is scanned for anchors
    span_pat = r'.*'           # regex selecting which part of the text to keep
    shortener = None           # optional URL-shortener template (see shorten_url)
    itemtag = 'item'

    h = get_args()
    if h.has_key('uri'):
        uri = h['uri']
    if h.has_key('target'):
        targettag = h['target']
    if h.has_key('span'):
        span_pat = re.compile(h['span'])
    if h.has_key('shortener'):
        shortener = h['shortener']

    if not uri:
        print '<error>no uri provided</error>'
        sys.exit()

    # Host of the feed URL, used to absolutize relative links found in it.
    host = urlparse.urlparse(uri).hostname

    original_rss = urllib.urlopen(uri).read()
    doc = minidom.parseString(original_rss)
    for item in doc.getElementsByTagName(itemtag):
        for x in item.getElementsByTagName(targettag):
            # nodeType == 3 is TEXT_NODE: only plain-text children carry
            # the (escaped) HTML we want to scan for anchors.
            for text in filter(lambda x: x.nodeType == 3, x.childNodes):
                html = text.data
                # NOTE(review): m.span() indexes the string finditer was
                # called on, yet html is re-sliced on every iteration; with
                # the default r'.*' pattern the trailing empty match can
                # reduce html to '' — it looks like only the FIRST match
                # was intended.  Confirm before relying on 'span'.
                for m in re.finditer(span_pat, html):
                    start,end = m.span()
                    html = html[start:end]
                links = extract_links(html)

                res = []
                for (title,path) in links:
                    if path.startswith('/'):
                        # Relative link: absolutize against the feed's host.
                        path = 'http://' + host + path
                    if shortener:
                        path = shorten_url(path, shortener)
                    res.append((title, path))
                if len(res) == 0:
                    # No links found: drop the whole item from the feed.
                    item.parentNode.removeChild(item)
                else:
                    res = ', '.join([x+' '+y for (x,y) in res])
                    text.data = res

    # TODO: when the description is empty, delete the whole item
    print doc.toxml().encode('utf-8')
Note: See TracBrowser for help on using the browser.