root/lang/python/hinagiku/hinagiku/core.py @ 195

Revision 195, 3.3 kB (checked in by ykzts, 6 years ago)

lang/python/hinagiku: added project

Line 
1# -*- coding: utf-8 -*-
2
3import sys
4import os
5import re
6import md5
7import urllib2
8import datetime
9import hinagiku
10import yaml
11from dateutil.tz import *
12from dateutil.parser import parse
13from genshi.template import MarkupTemplate
14
class Hinagiku:
  """Check a list of web pages for updates and render the result.

  The pages to check come from a YAML configuration file.  For every page
  the last modification time is determined (from the ``Last-Modified``
  response header or, failing that, from a change of the MD5 digest of the
  body) and remembered in a YAML cache file between runs.  The collected
  entries are finally rendered through a Genshi markup template.
  """

  def __init__(self, config_path=None, template_path=None, output_path=None):
    """Load the configuration, the template and the cache.

    config_path   -- YAML configuration file (default "config.yaml").
    template_path -- Genshi markup template (default "template.xml").
    output_path   -- file the rendered result is written to
                     (default "output.xml").

    Exits the process through sys.exit() when the configuration file or
    the template file does not exist.
    """
    self.flag = dict()

    self.config_path = config_path or "config.yaml"
    if not os.path.isfile(self.config_path):
      sys.exit("Configure file is not found.")
    with open(self.config_path) as config_file:
      # NOTE(review): yaml.load can construct arbitrary Python objects;
      # consider yaml.safe_load if this file may come from an untrusted
      # source.
      self.config = yaml.load(config_file.read())

    self.template_path = template_path or "template.xml"
    if not os.path.isfile(self.template_path):
      sys.exit("Template file is not found")
    with open(self.template_path) as template_file:
      # Hand the template text (not the file object) to Genshi so the
      # file can be closed right away.
      self.template = MarkupTemplate(template_file.read())

    self.output_path = output_path or "output.xml"
    self.output = list()

    self.cache_path = "cache.yaml"
    self.cache = dict()
    if os.path.isfile(self.cache_path):
      with open(self.cache_path) as cache_file:
        # NOTE(review): same yaml.load caveat as for the configuration.
        # An empty cache file loads as None, so fall back to a dict.
        self.cache = yaml.load(cache_file) or dict()

    self.ignore_pattern = dict()
    self.user_agent = "Hinagiku/%s" % hinagiku.__version__

  def run(self):
    """Crawl every configured page, then write the cache and the output."""
    self.crawl()
    self.out()

  def crawl(self):
    """Collect one output entry per configured page.

    Every page in the configuration must provide "check" (the URI that is
    fetched), "title" and "author".  "link" optionally replaces the
    checked URI in the output, and "ignore" is a regular expression whose
    matches are removed from the body before it is hashed.
    """
    for page in self.config:
      uri = page["check"]
      if page.get("ignore"):
        self.ignore_pattern[uri] = page["ignore"]
      date = self.get_last_modified(uri)
      entry = {
        "uri": page.get("link") or uri,
        "title": page["title"],
        "author": page["author"],
      }
      if date:
        entry["date"] = date
      self.output.append(entry)

  def out(self):
    """Write the cache file and the rendered output, newest entry first."""
    self._write(self.cache_path,
                yaml.dump(self.cache, default_flow_style=False))
    # Entries without a date sort as the empty string, i.e. before every
    # dated entry, and therefore end up last once the list is reversed.
    self.output.sort(key=lambda entry: entry.get("date") or "")
    self.output.reverse()
    self._write(self.output_path,
                self.template.generate(pages=self.output).render())

  def _write(self, path, value):
    """Write value to path, closing the file afterwards."""
    with open(path, "w") as target:
      target.write(value)

  def _to_http_date(self, iso_value):
    """Convert a cached ISO 8601 timestamp to an HTTP-date string.

    Returns None when the cached value cannot be parsed, in which case
    the caller simply omits the conditional header.
    """
    import calendar
    from email.utils import formatdate
    try:
      moment = parse(iso_value)
    except (ValueError, TypeError, OverflowError):
      return None
    # utctimetuple() converts aware values to UTC; naive values are taken
    # as already being UTC.
    return formatdate(calendar.timegm(moment.utctimetuple()), usegmt=True)

  def get_last_modified(self, uri):
    """Fetch uri and return its last modification time, or None.

    The request is made conditional with the cached ETag and modification
    time.  The per-URI cache entry is updated with "status", and where
    available "etag", "last_modified" and "body" (the MD5 hex digest of
    the body, used when the server sends no Last-Modified header).

    Returns the ISO 8601 string stored under "last_modified", or None if
    no modification time is known.
    """
    d = self.cache.get(uri) or dict()
    req = urllib2.Request(uri)
    req.add_header("User-Agent", self.user_agent)
    if "etag" in d:
      req.add_header("If-None-Match", d["etag"])
    if "last_modified" in d:
      # The cache stores ISO 8601, but the header requires an HTTP-date.
      http_date = self._to_http_date(d["last_modified"])
      if http_date:
        req.add_header("If-Modified-Since", http_date)
    try:
      page = urllib2.urlopen(req)
    except urllib2.HTTPError as e:
      # Includes "304 Not Modified": the cached time stays valid.
      d["status"] = e.code
    except urllib2.URLError:
      # No HTTP status is available; the original string marker is kept.
      d["status"] = "None"
    else:
      try:
        headers = page.headers
        etag = headers.getheader("ETag")
        if etag:
          d["etag"] = etag
        last_modified = headers.getheader("Last-Modified")
        if last_modified:
          d["last_modified"] = parse(last_modified).isoformat()
        else:
          # No Last-Modified header: detect changes by hashing the body.
          body = page.read()
          pattern = self.ignore_pattern.get(uri)
          if pattern:
            body = self.ignore_text(body, pattern)
          digest = md5.new(body).hexdigest()
          if digest != d.get("body"):
            d["last_modified"] = datetime.datetime.now(tzutc()).isoformat()
            d["body"] = digest
        d["status"] = page.code
      finally:
        page.close()
    self.cache[uri] = d
    return d.get("last_modified")

  def ignore_text(self, body, pattern):
    """Return body with every match of the regular expression removed."""
    return re.compile(pattern).sub("", body)
Note: See TracBrowser for help on using the browser.