root/websites/appspot.com/filteroid/trunk/beta/rsslinker.py @ 36574

Revision 36574, 2.8 kB (checked in by whym, 5 years ago)

* rsslinker:
  • fix hostname
  • change output format
  • keep empty values in the query string
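
The last item refers to the keep_blank_values flag passed to cgi.parse_qsl in get_args below: without it, parameters supplied with an empty value (such as fil= and sub= in the sample URL) would be dropped from the parsed query string. A minimal illustration (Python 2, with illustrative values):

    from cgi import parse_qsl
    parse_qsl('uri=http://example.com/rss&fil=')                          # [('uri', 'http://example.com/rss')]
    parse_qsl('uri=http://example.com/rss&fil=', keep_blank_values=True)  # [('uri', 'http://example.com/rss'), ('fil', '')]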
#! /usr/bin/env python
# -*- encoding: utf-8 -*-

# link extractor for RSS

# sample:
# http://localhost:8080/beta/rss?uri=http://whym.tumblr.com/rss&fil=&sub=
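# other query-string parameters read in __main__ below (the values shown are
# the script's built-in defaults, given here for illustration only):
#   target=description  tag whose text content is rewritten
#   span=.*             regexp restricting the part of the text that is scanned
#   shortener=<URL template containing %s for the link to shorten> (no default)
# any of these can also be set in rsslinker.yaml (see get_args)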

from xml.dom import minidom
import urlparse
import urllib
import sys
import re
from cgi import parse_qsl
import yaml
import os

from urllib import FancyURLopener

# fetch URLs with a browser-like User-Agent instead of urllib's default,
# by routing urllib.urlopen/urlretrieve through a FancyURLopener subclass
class MyOpener(FancyURLopener):
    version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'
myopener = MyOpener()
urllib.urlopen = myopener.open
urllib.urlretrieve = myopener.retrieve

def get_args():
    # read default settings from rsslinker.yaml, then override them with
    # CGI query-string parameters (blank values such as 'fil=' are kept)
    ret = yaml.load(open('rsslinker.yaml'))
    if not ret:
        ret = {}
    args = os.environ
    if args.has_key('QUERY_STRING'):
        for (x,y) in parse_qsl(args['QUERY_STRING'], keep_blank_values=True):
            ret[x] = y
    return ret

def extract_links(html):
    # collect (anchor text, URL) pairs from the <a href="..."> tags in an
    # HTML fragment, e.g. '<a href="/x">Foo</a>' yields [('Foo', '/x')]
    ret = []
    for m in re.finditer(r'<a .*?href="(.*?)".*?>(.*?)</a>', html):
        ret.append((m.group(2), m.group(1)))
    return ret

def shorten_url(url, shortener, tag='shortUrl'):
    # call a URL-shortening web API (given as a template with %s for the long
    # URL) and read the shortened URL back from the given tag of its XML reply
    doc = minidom.parseString(urllib.urlopen(shortener % urllib.quote(url)).read())
    url = doc.getElementsByTagName(tag)[0].firstChild.data
    return url

if __name__ == '__main__':
    # TODO: switch to text/plain or similar when reporting an error
    print 'Content-Type: text/xml; charset="UTF-8"'
    print ''

    # defaults, overridable via rsslinker.yaml or the query string
    uri = None
    targettag = 'description'
    span_pat = r'.*'
    shortener = None
    itemtag = 'item'

    h = get_args()
    if h.has_key('uri'):
        uri = h['uri']
    if h.has_key('target'):
        targettag = h['target']
    if h.has_key('span'):
        span_pat = re.compile(h['span'])
    if h.has_key('shortener'):
        shortener = h['shortener']

    if not uri:
        print '<error>no uri provided</error>'
        sys.exit()

    # fetch the feed and remember the final hostname (after redirects) so that
    # relative links can be made absolute later
    uri = urllib.urlopen(uri)
    host = urlparse.urlparse(uri.geturl()).hostname
    original_rss = uri.read()
    doc = minidom.parseString(original_rss)
    for item in doc.getElementsByTagName(itemtag):
        for x in item.getElementsByTagName(targettag):
            for text in filter(lambda x: x.nodeType == 3, x.childNodes):  # text nodes only
                html = text.data
                # narrow the text to the first span matching the 'span' pattern
                # (the default pattern r'.*' keeps the whole text)
                m = re.search(span_pat, html)
                if m:
                    html = html[m.start():m.end()]
                links = extract_links(html)

                res = []
                for (title,path) in links:
                    # make relative links absolute using the feed's hostname
                    if path.startswith('/'):
                        path = 'http://' + host + path
                    if shortener:
                        path = shorten_url(path, shortener)
                    res.append((title, path))
                if len(res) == 0:
                    # drop items that contain no links at all
                    item.parentNode.removeChild(item)
                else:
                    # replace the item text with a "title URL - title URL" list
                    res = ' - '.join([x+' '+y for (x,y) in res]) + ' '
                    text.data = res

    # emit the rewritten feed
    print doc.toxml().encode('utf-8')
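
For quick local testing, the helper functions can be imported and exercised directly; this is a minimal sketch assuming it runs in the same directory as rsslinker.py, with an illustrative HTML fragment as input:

    import rsslinker
    print rsslinker.extract_links('<a href="/post/1">first</a> and <a href="http://example.com/2">second</a>')
    # [('first', '/post/1'), ('second', 'http://example.com/2')]

With rsslinker.yaml in place, the CGI entry point can also be invoked from a shell, e.g. QUERY_STRING='uri=http://whym.tumblr.com/rss' python rsslinker.py.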