root/lang/python/cidr-mobilejp/trunk/scrape.py @ 4198

Revision 4198, 3.4 kB (checked in by tasuku, 5 years ago)

lang/python/cidr-mobilejp: bug fixed on 32bit arch. reported by id:hetima

  • Property svn:executable set to *
Line 
1#!/usr/bin/env python
2
3# -*- coding: utf-8 -*-
4
5import urllib
6import re
7
class Docomo(object):
  """Scraper for the address list published on the docomo page at url().

  run() yields one (address, 'docomo') tuple for every <li> element whose
  whole content is made of digits, dots and slashes.
  """

  def name(self):
    """Return the carrier label attached to every scraped entry."""
    return 'docomo'

  def url(self):
    """Return the address of the page that is scraped."""
    return 'http://www.nttdocomo.co.jp/service/imode/make/content/ip/'

  def run(self):
    """Download the page at url() and return a list of (address, name) tuples."""
    response = urllib.urlopen(self.url())
    try:
      content = response.read()
    finally:
      # Release the connection explicitly instead of leaving it to the
      # garbage collector.
      response.close()
    return self._parse(content)

  def _parse(self, content):
    """Extract (address, name) tuples from the page markup in content."""
    n = self.name()
    return [(ip, n) for ip in re.findall(r'<li>([\d\./]+)</li>', content, re.M)]
19
class Ezweb(object):
  """Scraper for the address table published on the page at url().

  The page keeps the address and the '/NN' prefix length in two neighbouring
  table cells; run() glues each pair back together into one string.
  """

  def name(self):
    """Return the carrier label attached to every scraped entry."""
    return 'ezweb'

  def url(self):
    """Return the address of the page that is scraped."""
    return 'http://www.au.kddi.com/ezfactory/tec/spec/ezsava_ip.html'

  def run(self):
    """Download the page at url() and return a list of (address, name) tuples."""
    response = urllib.urlopen(self.url())
    try:
      content = response.read()
    finally:
      # Release the connection explicitly instead of leaving it to the
      # garbage collector.
      response.close()
    return self._parse(content)

  def _parse(self, content):
    """Extract (address + prefix, name) tuples from the markup in content."""
    n = self.name()
    # First cell: dotted address; second cell: '/NN'.  The two literals below
    # concatenate to a single pattern.
    pattern = (r'<td>\s*<div class="TableText">([\d\.]+)</div>\s*</td>\s+'
               r'<td>\s*<div class="TableText">(/\d+)</div>\s*</td>')
    return [(ip + sn, n) for ip, sn in re.findall(pattern, content, re.M)]
32
class Softbank(object):
  """Scraper for the address list published on the page at url().

  run() yields one (address, 'softbank') tuple for every matching <FONT>
  element whose content is made of digits, dots and slashes.
  """

  def name(self):
    """Return the carrier label attached to every scraped entry."""
    return 'softbank'

  def url(self):
    """Return the address of the page that is scraped."""
    return 'http://developers.softbankmobile.co.jp/dp/tech_svc/web/ip.php'

  def run(self):
    """Download the page at url() and return a list of (address, name) tuples."""
    response = urllib.urlopen(self.url())
    try:
      content = response.read()
    finally:
      # Release the connection explicitly instead of leaving it to the
      # garbage collector.
      response.close()
    return self._parse(content)

  def _parse(self, content):
    """Extract (address, name) tuples from the page markup in content."""
    n = self.name()
    # Raw string: same pattern text as before, but '\d' and '\.' no longer
    # depend on Python passing unknown escapes through untouched.
    pattern = r'<FONT size="2" class="j10".*?>([\d\./]+)</FONT>'
    return [(ip, n) for ip in re.findall(pattern, content, re.M)]
45
class AirHPhone(object):
  """Scraper for the address table published on the page at url().

  run() yields one (address, 'airhphone') tuple for every table cell that
  matches the exact cell markup in the pattern and contains only digits,
  dots and slashes.
  """

  def name(self):
    """Return the carrier label attached to every scraped entry."""
    return 'airhphone'

  def url(self):
    """Return the address of the page that is scraped."""
    return 'http://www.willcom-inc.com/ja/service/contents_service/club_air_edge/for_phone/ip/'

  def run(self):
    """Download the page at url() and return a list of (address, name) tuples."""
    response = urllib.urlopen(self.url())
    try:
      content = response.read()
    finally:
      # Release the connection explicitly instead of leaving it to the
      # garbage collector.
      response.close()
    return self._parse(content)

  def _parse(self, content):
    """Extract (address, name) tuples from the page markup in content."""
    n = self.name()
    # Raw string: same pattern text as before, but '\d' and '\.' no longer
    # depend on Python passing unknown escapes through untouched.
    pattern = r'<td align="center" bgcolor="#f5f5f5" width="50%"><font size="2">([\d\./]+)</font></td>'
    return [(ip, n) for ip in re.findall(pattern, content, re.M)]
58
def _merge_adjacent(ranges):
  """Collapse runs of touching ranges that belong to the same carrier.

  ranges is a sorted list of (first, last, carrier) tuples.  Two neighbours
  are joined only when the next one starts exactly one past the end of the
  previous one and both carry the same carrier label; overlapping or
  duplicate entries are left as they are.  An empty list yields an empty list.
  """
  merged = []
  for first, last, carrier in ranges:
    if merged:
      prev_first, prev_last, prev_carrier = merged[-1]
      if prev_last + 1 == first and prev_carrier == carrier:
        # Extend the range collected so far instead of starting a new one.
        merged[-1] = (prev_first, last, carrier)
        continue
    merged.append((first, last, carrier))
  return merged


def get_cidr():
  """Scrape every carrier page and print a PHP ip2mobile() function.

  Each carrier class is instantiated and its run() result (a list of
  (cidr, carrier) tuples) is collected.  Every CIDR string is converted to
  an inclusive (first, last) pair of unsigned 32-bit integers, the pairs are
  sorted, touching pairs of the same carrier are merged, and the result is
  written to standard output as PHP source via output_php().

  If nothing was scraped, the generated function contains no range checks
  and always returns 'pc'.

  Raises:
    ValueError: if a scraped entry does not contain exactly one
      'address/bits' pair.
  """
  # Kept function-local, as in the rest of this script.
  import socket
  import struct

  sources = []
  for carrier in (Docomo, Ezweb, Softbank, AirHPhone):
    sources += carrier().run()

  # convert cidr to ipaddress
  cidr_re = re.compile(r'([\d\.]+)/(\d+)', re.M)
  ranges = []
  for cidr, carrier_name in sources:
    # Exactly one match is required; anything else fails the unpacking.
    [(ip, bit)] = cidr_re.findall(cidr)
    ipnum = struct.unpack('>L', socket.inet_aton(ip))[0]
    # Host-part mask.  Plain ints grow past 32 bits automatically, so no
    # long literal is needed for a /0 entry.
    mask_ed = (1 << (32 - int(bit))) - 1
    ip_st = ipnum & ~mask_ed
    ip_ed = ip_st | mask_ed
    ranges.append((ip_st, ip_ed, carrier_name))

  # sort
  ranges.sort()

  # merge adjacent range
  mranges = _merge_adjacent(ranges)

  # output php source
  print("""<?php
function ip2mobile($ip) {
  $n = sprintf('%u', ip2long($ip));
""")
  output_php(mranges, 0, len(mranges) - 1, 2)
  print("""  return 'pc';
}
?>
""")
113
def output_php(range, st, ed, ind):
  """Print a nested PHP if/else binary search over range[st] .. range[ed].

  Args:
    range: sequence of (first, last, label) entries, sorted by first.
    st: index of the first entry to emit (inclusive).
    ed: index of the last entry to emit (inclusive).
    ind: number of spaces to indent the emitted PHP with.

  Nothing is printed when st > ed.  A single entry becomes one bounded
  'if'; otherwise the middle entry is tested and the two halves are emitted
  recursively, two spaces deeper, in the 'if' and 'else' branches.
  """
  if st > ed:
    return
  pad = ' ' * ind
  if st == ed:
    only = range[st]
    print(pad + 'if ($n >= %d && $n <= %d) {' % (only[0], only[1]))
    print(pad + "  return '%s';" % only[2])
    print(pad + '}')
    return
  # Floor division keeps the midpoint an exact integer on every Python
  # version instead of going through a float.
  b = (st + ed) // 2
  mid = range[b]
  print(pad + 'if ($n < %d) {' % mid[0])
  output_php(range, st, b - 1, ind + 2)
  print(pad + '} else if ($n <= %d) {' % mid[1])
  print(pad + "  return '%s';" % mid[2])
  print(pad + '} else {')
  output_php(range, b + 1, ed, ind + 2)
  print(pad + '}')
132
# Script entry point: when executed directly, write the generated PHP
# source to standard output.
if __name__ == "__main__":
  get_cidr()
Note: See TracBrowser for help on using the browser.