root/websites/wassr-fav.appspot.com/BeautifulSoup.py @ 38949

Revision 16738, 74.6 kB (checked in by mattn, 6 years ago)

Source of wassr-fav.appspot.com

1"""Beautiful Soup
2Elixir and Tonic
3"The Screen-Scraper's Friend"
4http://www.crummy.com/software/BeautifulSoup/
5
6Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7tree representation. It provides methods and Pythonic idioms that make
8it easy to navigate, search, and modify the tree.
9
10A well-formed XML/HTML document yields a well-formed data
11structure. An ill-formed XML/HTML document yields a correspondingly
12ill-formed data structure. If your document is only locally
13well-formed, you can use this library to find and process the
14well-formed part of it.
15
16Beautiful Soup works with Python 2.2 and up. It has no external
17dependencies, but you'll have more success at converting data to UTF-8
18if you also install these three packages:
19
20* chardet, for auto-detecting character encodings
21  http://chardet.feedparser.org/
22* cjkcodecs and iconv_codec, which add more encodings to the ones supported
23  by stock Python.
24  http://cjkpython.i18n.org/
25
26Beautiful Soup defines classes for two main parsing strategies:
27
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29   language that kind of looks like XML.
30
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32   or invalid. This class has web browser-like heuristics for
33   obtaining a sensible parse tree in the face of common HTML errors.
34
35Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36the encoding of an HTML or XML document, and converting it to
37Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
38
39For more than you ever wanted to know about Beautiful Soup, see the
40documentation:
41http://www.crummy.com/software/BeautifulSoup/documentation.html
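
A quick, illustrative sketch of typical use (the markup and the variable
names below are invented for the example):

  from BeautifulSoup import BeautifulSoup
  doc = "<html><body><a href='http://example.com/'>a link</a></body></html>"
  soup = BeautifulSoup(doc)
  for anchor in soup.findAll('a'):
      print anchor['href']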
42
43Here, have some legalese:
44
45Copyright (c) 2004-2007, Leonard Richardson
46
47All rights reserved.
48
49Redistribution and use in source and binary forms, with or without
50modification, are permitted provided that the following conditions are
51met:
52
53  * Redistributions of source code must retain the above copyright
54    notice, this list of conditions and the following disclaimer.
55
56  * Redistributions in binary form must reproduce the above
57    copyright notice, this list of conditions and the following
58    disclaimer in the documentation and/or other materials provided
59    with the distribution.
60
61  * Neither the name of the Beautiful Soup Consortium and All
62    Night Kosher Bakery nor the names of its contributors may be
63    used to endorse or promote products derived from this software
64    without specific prior written permission.
65
66THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
77
78"""
79from __future__ import generators
80
81__author__ = "Leonard Richardson (leonardr@segfault.org)"
82__version__ = "3.0.6"
83__copyright__ = "Copyright (c) 2004-2008 Leonard Richardson"
84__license__ = "New-style BSD"
85
86from sgmllib import SGMLParser, SGMLParseError
87import codecs
88import types
89import re
90import sgmllib
91try:
92  from htmlentitydefs import name2codepoint
93except ImportError:
94  name2codepoint = {}
95
96#This hack makes Beautiful Soup able to parse XML with namespaces
97sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
98
99DEFAULT_OUTPUT_ENCODING = "utf-8"
100
101# First, the classes that represent markup elements.
102
103class PageElement:
104    """Contains the navigational information for some part of the page
105    (either a tag or a piece of text)"""
106
107    def setup(self, parent=None, previous=None):
108        """Sets up the initial relations between this element and
109        other elements."""
110        self.parent = parent
111        self.previous = previous
112        self.next = None
113        self.previousSibling = None
114        self.nextSibling = None
115        if self.parent and self.parent.contents:
116            self.previousSibling = self.parent.contents[-1]
117            self.previousSibling.nextSibling = self
118
119    def replaceWith(self, replaceWith):
120        oldParent = self.parent
121        myIndex = self.parent.contents.index(self)
122        if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
123            # We're replacing this element with one of its siblings.
124            index = self.parent.contents.index(replaceWith)
125            if index and index < myIndex:
126                # Furthermore, it comes before this element. That
127                # means that when we extract it, the index of this
128                # element will change.
129                myIndex = myIndex - 1
130        self.extract()
131        oldParent.insert(myIndex, replaceWith)
132
133    def extract(self):
134        """Destructively rips this element out of the tree."""
135        if self.parent:
136            try:
137                self.parent.contents.remove(self)
138            except ValueError:
139                pass
140
141        #Find the two elements that would be next to each other if
142        #this element (and any children) hadn't been parsed. Connect
143        #the two.
144        lastChild = self._lastRecursiveChild()
145        nextElement = lastChild.next
146
147        if self.previous:
148            self.previous.next = nextElement
149        if nextElement:
150            nextElement.previous = self.previous
151        self.previous = None
152        lastChild.next = None
153
154        self.parent = None
155        if self.previousSibling:
156            self.previousSibling.nextSibling = self.nextSibling
157        if self.nextSibling:
158            self.nextSibling.previousSibling = self.previousSibling
159        self.previousSibling = self.nextSibling = None
160        return self
161
162    def _lastRecursiveChild(self):
163        "Finds the last element beneath this object to be parsed."
164        lastChild = self
165        while hasattr(lastChild, 'contents') and lastChild.contents:
166            lastChild = lastChild.contents[-1]
167        return lastChild
168
169    def insert(self, position, newChild):
170        if (isinstance(newChild, basestring)
171            or isinstance(newChild, unicode)) \
172            and not isinstance(newChild, NavigableString):
173            newChild = NavigableString(newChild)
174
175        position =  min(position, len(self.contents))
176        if hasattr(newChild, 'parent') and newChild.parent != None:
177            # We're 'inserting' an element that's already one
178            # of this object's children.
179            if newChild.parent == self:
180                index = self.find(newChild)
181                if index and index < position:
182                    # Furthermore we're moving it further down the
183                    # list of this object's children. That means that
184                    # when we extract this element, our target index
185                    # will jump down one.
186                    position = position - 1
187            newChild.extract()
188
189        newChild.parent = self
190        previousChild = None
191        if position == 0:
192            newChild.previousSibling = None
193            newChild.previous = self
194        else:
195            previousChild = self.contents[position-1]
196            newChild.previousSibling = previousChild
197            newChild.previousSibling.nextSibling = newChild
198            newChild.previous = previousChild._lastRecursiveChild()
199        if newChild.previous:
200            newChild.previous.next = newChild
201
202        newChildsLastElement = newChild._lastRecursiveChild()
203
204        if position >= len(self.contents):
205            newChild.nextSibling = None
206
207            parent = self
208            parentsNextSibling = None
209            while not parentsNextSibling:
210                parentsNextSibling = parent.nextSibling
211                parent = parent.parent
212                if not parent: # This is the last element in the document.
213                    break
214            if parentsNextSibling:
215                newChildsLastElement.next = parentsNextSibling
216            else:
217                newChildsLastElement.next = None
218        else:
219            nextChild = self.contents[position]
220            newChild.nextSibling = nextChild
221            if newChild.nextSibling:
222                newChild.nextSibling.previousSibling = newChild
223            newChildsLastElement.next = nextChild
224
225        if newChildsLastElement.next:
226            newChildsLastElement.next.previous = newChildsLastElement
227        self.contents.insert(position, newChild)
228
229    def append(self, tag):
230        """Appends the given tag to the contents of this tag."""
231        self.insert(len(self.contents), tag)
232
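    # A tree-modification sketch using the methods above (the tag names and
    # the `soup` object are assumptions for illustration):
    #
    #   tag = soup.find('b')
    #   newTag = Tag(soup, 'i')           # the soup object doubles as the parser
    #   tag.replaceWith(newTag)           # swap the <b> tag for the new <i> tag
    #   newTag.append('emphasised text')  # plain strings become NavigableStrings
    #   detached = newTag.extract()       # rip it back out of the tree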
233    def findNext(self, name=None, attrs={}, text=None, **kwargs):
234        """Returns the first item that matches the given criteria and
235        appears after this Tag in the document."""
236        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
237
238    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
239                    **kwargs):
240        """Returns all items that match the given criteria and appear
241        after this Tag in the document."""
242        return self._findAll(name, attrs, text, limit, self.nextGenerator,
243                             **kwargs)
244
245    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
246        """Returns the closest sibling to this Tag that matches the
247        given criteria and appears after this Tag in the document."""
248        return self._findOne(self.findNextSiblings, name, attrs, text,
249                             **kwargs)
250
251    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
252                         **kwargs):
253        """Returns the siblings of this Tag that match the given
254        criteria and appear after this Tag in the document."""
255        return self._findAll(name, attrs, text, limit,
256                             self.nextSiblingGenerator, **kwargs)
257    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
258
259    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
260        """Returns the first item that matches the given criteria and
261        appears before this Tag in the document."""
262        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
263
264    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
265                        **kwargs):
266        """Returns all items that match the given criteria and appear
267        before this Tag in the document."""
268        return self._findAll(name, attrs, text, limit, self.previousGenerator,
269                           **kwargs)
270    fetchPrevious = findAllPrevious # Compatibility with pre-3.x
271
272    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
273        """Returns the closest sibling to this Tag that matches the
274        given criteria and appears before this Tag in the document."""
275        return self._findOne(self.findPreviousSiblings, name, attrs, text,
276                             **kwargs)
277
278    def findPreviousSiblings(self, name=None, attrs={}, text=None,
279                             limit=None, **kwargs):
280        """Returns the siblings of this Tag that match the given
281        criteria and appear before this Tag in the document."""
282        return self._findAll(name, attrs, text, limit,
283                             self.previousSiblingGenerator, **kwargs)
284    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
285
286    def findParent(self, name=None, attrs={}, **kwargs):
287        """Returns the closest parent of this Tag that matches the given
288        criteria."""
289        # NOTE: We can't use _findOne because findParents takes a different
290        # set of arguments.
291        r = None
292        l = self.findParents(name, attrs, 1)
293        if l:
294            r = l[0]
295        return r
296
297    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
298        """Returns the parents of this Tag that match the given
299        criteria."""
300
301        return self._findAll(name, attrs, None, limit, self.parentGenerator,
302                             **kwargs)
303    fetchParents = findParents # Compatibility with pre-3.x
304
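    # Illustrative calls to the directional finders above (a sketch; the tag
    # names and the `soup` object are assumptions for the example):
    #
    #   cell = soup.find('td')
    #   cell.findNext('td')           # the next <td> anywhere later in the document
    #   cell.findNextSiblings('td')   # every later <td> at the same level
    #   cell.findParent('table')      # the closest enclosing <table>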
305    #These methods do the real heavy lifting.
306
307    def _findOne(self, method, name, attrs, text, **kwargs):
308        r = None
309        l = method(name, attrs, text, 1, **kwargs)
310        if l:
311            r = l[0]
312        return r
313
314    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
315        "Iterates over a generator looking for things that match."
316
317        if isinstance(name, SoupStrainer):
318            strainer = name
319        else:
320            # Build a SoupStrainer
321            strainer = SoupStrainer(name, attrs, text, **kwargs)
322        results = ResultSet(strainer)
323        g = generator()
324        while True:
325            try:
326                i = g.next()
327            except StopIteration:
328                break
329            if i:
330                found = strainer.search(i)
331                if found:
332                    results.append(found)
333                    if limit and len(results) >= limit:
334                        break
335        return results
336
337    #These Generators can be used to navigate starting from both
338    #NavigableStrings and Tags.
339    def nextGenerator(self):
340        i = self
341        while i:
342            i = i.next
343            yield i
344
345    def nextSiblingGenerator(self):
346        i = self
347        while i:
348            i = i.nextSibling
349            yield i
350
351    def previousGenerator(self):
352        i = self
353        while i:
354            i = i.previous
355            yield i
356
357    def previousSiblingGenerator(self):
358        i = self
359        while i:
360            i = i.previousSibling
361            yield i
362
363    def parentGenerator(self):
364        i = self
365        while i:
366            i = i.parent
367            yield i
368
369    # Utility methods
370    def substituteEncoding(self, str, encoding=None):
371        encoding = encoding or "utf-8"
372        return str.replace("%SOUP-ENCODING%", encoding)
373
374    def toEncoding(self, s, encoding=None):
375        """Encodes an object to a string in the given encoding, or to
376        Unicode if no encoding is specified."""
377        if isinstance(s, unicode):
378            if encoding:
379                s = s.encode(encoding)
380        elif isinstance(s, str):
381            if encoding:
382                s = s.encode(encoding)
383            else:
384                s = unicode(s)
385        else:
386            if encoding:
387                s  = self.toEncoding(str(s), encoding)
388            else:
389                s = unicode(s)
390        return s
391
392class NavigableString(unicode, PageElement):
393
394    def __getnewargs__(self):
395        return (NavigableString.__str__(self),)
396
397    def __getattr__(self, attr):
398        """text.string gives you text. This is for backwards
399        compatibility for Navigable*String, but for CData* it lets you
400        get the string without the CData wrapper."""
401        if attr == 'string':
402            return self
403        else:
404            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
405
406    def __unicode__(self):
407        return str(self).decode(DEFAULT_OUTPUT_ENCODING)
408
409    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
410        if encoding:
411            return self.encode(encoding)
412        else:
413            return self
414
415class CData(NavigableString):
416
417    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
418        return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
419
420class ProcessingInstruction(NavigableString):
421    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
422        output = self
423        if "%SOUP-ENCODING%" in output:
424            output = self.substituteEncoding(output, encoding)
425        return "<?%s?>" % self.toEncoding(output, encoding)
426
427class Comment(NavigableString):
428    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
429        return "<!--%s-->" % NavigableString.__str__(self, encoding)
430
431class Declaration(NavigableString):
432    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
433        return "<!%s>" % NavigableString.__str__(self, encoding)
434
435class Tag(PageElement):
436
437    """Represents a found HTML tag with its attributes and contents."""
438
439    def _invert(h):
440        "Cheap function to invert a hash."
441        i = {}
442        for k,v in h.items():
443            i[v] = k
444        return i
445
446    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
447                                      "quot" : '"',
448                                      "amp" : "&",
449                                      "lt" : "<",
450                                      "gt" : ">" }
451
452    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
453
454    def _convertEntities(self, match):
455        """Used in a call to re.sub to replace HTML, XML, and numeric
456        entities with the appropriate Unicode characters. If HTML
457        entities are being converted, any unrecognized entities are
458        escaped."""
459        x = match.group(1)
460        if self.convertHTMLEntities and x in name2codepoint:
461            return unichr(name2codepoint[x])
462        elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
463            if self.convertXMLEntities:
464                return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
465            else:
466                return u'&%s;' % x
467        elif len(x) > 0 and x[0] == '#':
468            # Handle numeric entities
469            if len(x) > 1 and x[1] == 'x':
470                return unichr(int(x[2:], 16))
471            else:
472                return unichr(int(x[1:]))
473
474        elif self.escapeUnrecognizedEntities:
475            return u'&amp;%s;' % x
476        else:
477            return u'&%s;' % x
478
479    def __init__(self, parser, name, attrs=None, parent=None,
480                 previous=None):
481        "Basic constructor."
482
483        # We don't actually store the parser object: that lets extracted
484        # chunks be garbage-collected
485        self.parserClass = parser.__class__
486        self.isSelfClosing = parser.isSelfClosingTag(name)
487        self.name = name
488        if attrs == None:
489            attrs = []
490        self.attrs = attrs
491        self.contents = []
492        self.setup(parent, previous)
493        self.hidden = False
494        self.containsSubstitutions = False
495        self.convertHTMLEntities = parser.convertHTMLEntities
496        self.convertXMLEntities = parser.convertXMLEntities
497        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
498
499        # Convert any HTML, XML, or numeric entities in the attribute values.
500        convert = lambda(k, val): (k,
501                                   re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
502                                          self._convertEntities,
503                                          val))
504        self.attrs = map(convert, self.attrs)
505
506    def get(self, key, default=None):
507        """Returns the value of the 'key' attribute for the tag, or
508        the value given for 'default' if it doesn't have that
509        attribute."""
510        return self._getAttrMap().get(key, default)
511
512    def has_key(self, key):
513        return self._getAttrMap().has_key(key)
514
515    def __getitem__(self, key):
516        """tag[key] returns the value of the 'key' attribute for the tag,
517        and throws an exception if it's not there."""
518        return self._getAttrMap()[key]
519
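    # Attribute access in a nutshell (a sketch; assumes `tag` is a parsed
    # <a href="..."> element):
    #
    #   tag['href']             # raises KeyError if the attribute is missing
    #   tag.get('href', '#')    # falls back to '#' instead of raising
    #   tag.has_key('href')     # True or False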
520    def __iter__(self):
521        "Iterating over a tag iterates over its contents."
522        return iter(self.contents)
523
524    def __len__(self):
525        "The length of a tag is the length of its list of contents."
526        return len(self.contents)
527
528    def __contains__(self, x):
529        return x in self.contents
530
531    def __nonzero__(self):
532        "A tag is non-None even if it has no contents."
533        return True
534
535    def __setitem__(self, key, value):
536        """Setting tag[key] sets the value of the 'key' attribute for the
537        tag."""
538        self._getAttrMap()
539        self.attrMap[key] = value
540        found = False
541        for i in range(0, len(self.attrs)):
542            if self.attrs[i][0] == key:
543                self.attrs[i] = (key, value)
544                found = True
545        if not found:
546            self.attrs.append((key, value))
547        self._getAttrMap()[key] = value
548
549    def __delitem__(self, key):
550        "Deleting tag[key] deletes all 'key' attributes for the tag."
551        for item in self.attrs:
552            if item[0] == key:
553                self.attrs.remove(item)
554                #We don't break because bad HTML can define the same
555                #attribute multiple times.
556            self._getAttrMap()
557            if self.attrMap.has_key(key):
558                del self.attrMap[key]
559
560    def __call__(self, *args, **kwargs):
561        """Calling a tag like a function is the same as calling its
562        findAll() method. Eg. tag('a') returns a list of all the A tags
563        found within this tag."""
564        return apply(self.findAll, args, kwargs)
565
566    def __getattr__(self, tag):
567        #print "Getattr %s.%s" % (self.__class__, tag)
568        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
569            return self.find(tag[:-3])
570        elif tag.find('__') != 0:
571            return self.find(tag)
572        raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
573
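    # The two conveniences above, illustrated (a sketch; `soup` is assumed to
    # be a parsed document):
    #
    #   soup.title    # same as soup.find('title')
    #   soup.bTag     # same as soup.find('b'); useful when the tag name would
    #                 # otherwise be mistaken for a real attribute
    #   soup('a')     # same as soup.findAll('a')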
574    def __eq__(self, other):
575        """Returns true iff this tag has the same name, the same attributes,
576        and the same contents (recursively) as the given tag.
577
578        NOTE: right now this will return false if two tags have the
579        same attributes in a different order. Should this be fixed?"""
580        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
581            return False
582        for i in range(0, len(self.contents)):
583            if self.contents[i] != other.contents[i]:
584                return False
585        return True
586
587    def __ne__(self, other):
588        """Returns true iff this tag is not identical to the other tag,
589        as defined in __eq__."""
590        return not self == other
591
592    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
593        """Renders this tag as a string."""
594        return self.__str__(encoding)
595
596    def __unicode__(self):
597        return self.__str__(None)
598
599    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
600                                           + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
601                                           + ")")
602
603    def _sub_entity(self, x):
604        """Used with a regular expression to substitute the
605        appropriate XML entity for an XML special character."""
606        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
607
608    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
609                prettyPrint=False, indentLevel=0):
610        """Returns a string or Unicode representation of this tag and
611        its contents. To get Unicode, pass None for encoding.
612
613        NOTE: since Python's HTML parser consumes whitespace, this
614        method is not certain to reproduce the whitespace present in
615        the original string."""
616
617        encodedName = self.toEncoding(self.name, encoding)
618
619        attrs = []
620        if self.attrs:
621            for key, val in self.attrs:
622                fmt = '%s="%s"'
623                if isString(val):
624                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
625                        val = self.substituteEncoding(val, encoding)
626
627                    # The attribute value either:
628                    #
629                    # * Contains no embedded double quotes or single quotes.
630                    #   No problem: we enclose it in double quotes.
631                    # * Contains embedded single quotes. No problem:
632                    #   double quotes work here too.
633                    # * Contains embedded double quotes. No problem:
634                    #   we enclose it in single quotes.
635                    # * Embeds both single _and_ double quotes. This
636                    #   can't happen naturally, but it can happen if
637                    #   you modify an attribute value after parsing
638                    #   the document. Now we have a bit of a
639                    #   problem. We solve it by enclosing the
640                    #   attribute in single quotes, and escaping any
641                    #   embedded single quotes to XML entities.
642                    if '"' in val:
643                        fmt = "%s='%s'"
644                        if "'" in val:
645                            # TODO: replace with apos when
646                            # appropriate.
647                            val = val.replace("'", "&squot;")
648
649                    # Now we're okay w/r/t quotes. But the attribute
650                    # value might also contain angle brackets, or
651                    # ampersands that aren't part of entities. We need
652                    # to escape those to XML entities too.
653                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
654
655                attrs.append(fmt % (self.toEncoding(key, encoding),
656                                    self.toEncoding(val, encoding)))
657        close = ''
658        closeTag = ''
659        if self.isSelfClosing:
660            close = ' /'
661        else:
662            closeTag = '</%s>' % encodedName
663
664        indentTag, indentContents = 0, 0
665        if prettyPrint:
666            indentTag = indentLevel
667            space = (' ' * (indentTag-1))
668            indentContents = indentTag + 1
669        contents = self.renderContents(encoding, prettyPrint, indentContents)
670        if self.hidden:
671            s = contents
672        else:
673            s = []
674            attributeString = ''
675            if attrs:
676                attributeString = ' ' + ' '.join(attrs)
677            if prettyPrint:
678                s.append(space)
679            s.append('<%s%s%s>' % (encodedName, attributeString, close))
680            if prettyPrint:
681                s.append("\n")
682            s.append(contents)
683            if prettyPrint and contents and contents[-1] != "\n":
684                s.append("\n")
685            if prettyPrint and closeTag:
686                s.append(space)
687            s.append(closeTag)
688            if prettyPrint and closeTag and self.nextSibling:
689                s.append("\n")
690            s = ''.join(s)
691        return s
692
693    def decompose(self):
694        """Recursively destroys the contents of this tree."""
695        contents = [i for i in self.contents]
696        for i in contents:
697            if isinstance(i, Tag):
698                i.decompose()
699            else:
700                i.extract()
701        self.extract()
702
703    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
704        return self.__str__(encoding, True)
705
706    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
707                       prettyPrint=False, indentLevel=0):
708        """Renders the contents of this tag as a string in the given
709        encoding. If encoding is None, returns a Unicode string."""
710        s=[]
711        for c in self:
712            text = None
713            if isinstance(c, NavigableString):
714                text = c.__str__(encoding)
715            elif isinstance(c, Tag):
716                s.append(c.__str__(encoding, prettyPrint, indentLevel))
717            if text and prettyPrint:
718                text = text.strip()
719            if text:
720                if prettyPrint:
721                    s.append(" " * (indentLevel-1))
722                s.append(text)
723                if prettyPrint:
724                    s.append("\n")
725        return ''.join(s)
726
727    #Soup methods
728
729    def find(self, name=None, attrs={}, recursive=True, text=None,
730             **kwargs):
731        """Return only the first child of this Tag matching the given
732        criteria."""
733        r = None
734        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
735        if l:
736            r = l[0]
737        return r
738    findChild = find
739
740    def findAll(self, name=None, attrs={}, recursive=True, text=None,
741                limit=None, **kwargs):
742        """Extracts a list of Tag objects that match the given
743        criteria.  You can specify the name of the Tag and any
744        attributes you want the Tag to have.
745
746        The value of a key-value pair in the 'attrs' map can be a
747        string, a list of strings, a regular expression object, or a
748        callable that takes a string and returns whether or not the
749        string matches for some custom definition of 'matches'. The
750        same is true of the tag name."""
751        generator = self.recursiveChildGenerator
752        if not recursive:
753            generator = self.childGenerator
754        return self._findAll(name, attrs, text, limit, generator, **kwargs)
755    findChildren = findAll
756
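    # Some illustrative findAll() calls (a sketch; `soup` and the attribute
    # values shown are assumptions for the example):
    #
    #   soup.findAll('a')                         # every <a> tag
    #   soup.findAll('a', {'class': 'external'})  # only <a class="external">
    #   soup.findAll(re.compile('^h[1-6]$'))      # any heading tag
    #   soup.findAll(text=re.compile('Python'))   # text nodes mentioning Python
    #   soup.findAll('div', limit=3)              # stop after three matches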
757    # Pre-3.x compatibility methods
758    first = find
759    fetch = findAll
760
761    def fetchText(self, text=None, recursive=True, limit=None):
762        return self.findAll(text=text, recursive=recursive, limit=limit)
763
764    def firstText(self, text=None, recursive=True):
765        return self.find(text=text, recursive=recursive)
766
767    #Private methods
768
769    def _getAttrMap(self):
770        """Initializes a map representation of this tag's attributes,
771        if not already initialized."""
772        if not getattr(self, 'attrMap'):
773            self.attrMap = {}
774            for (key, value) in self.attrs:
775                self.attrMap[key] = value
776        return self.attrMap
777
778    #Generator methods
779    def childGenerator(self):
780        for i in range(0, len(self.contents)):
781            yield self.contents[i]
782        raise StopIteration
783
784    def recursiveChildGenerator(self):
785        stack = [(self, 0)]
786        while stack:
787            tag, start = stack.pop()
788            if isinstance(tag, Tag):
789                for i in range(start, len(tag.contents)):
790                    a = tag.contents[i]
791                    yield a
792                    if isinstance(a, Tag) and tag.contents:
793                        if i < len(tag.contents) - 1:
794                            stack.append((tag, i+1))
795                        stack.append((a, 0))
796                        break
797        raise StopIteration
798
799# Next, a couple classes to represent queries and their results.
800class SoupStrainer:
801    """Encapsulates a number of ways of matching a markup element (tag or
802    text)."""
803
804    def __init__(self, name=None, attrs={}, text=None, **kwargs):
805        self.name = name
806        if isString(attrs):
807            kwargs['class'] = attrs
808            attrs = None
809        if kwargs:
810            if attrs:
811                attrs = attrs.copy()
812                attrs.update(kwargs)
813            else:
814                attrs = kwargs
815        self.attrs = attrs
816        self.text = text
817
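    # A SoupStrainer can be passed anywhere find()/findAll() accept a name, or
    # to the soup constructor as parseOnlyThese so that only matching elements
    # are parsed at all. A sketch (the pattern and markup are assumptions):
    #
    #   onlyHttpLinks = SoupStrainer('a', href=re.compile('^http://'))
    #   soup = BeautifulSoup(markup, parseOnlyThese=onlyHttpLinks)
    #   links = soup.findAll(onlyHttpLinks)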
818    def __str__(self):
819        if self.text:
820            return self.text
821        else:
822            return "%s|%s" % (self.name, self.attrs)
823
824    def searchTag(self, markupName=None, markupAttrs={}):
825        found = None
826        markup = None
827        if isinstance(markupName, Tag):
828            markup = markupName
829            markupAttrs = markup
830        callFunctionWithTagData = callable(self.name) \
831                                and not isinstance(markupName, Tag)
832
833        if (not self.name) \
834               or callFunctionWithTagData \
835               or (markup and self._matches(markup, self.name)) \
836               or (not markup and self._matches(markupName, self.name)):
837            if callFunctionWithTagData:
838                match = self.name(markupName, markupAttrs)
839            else:
840                match = True
841                markupAttrMap = None
842                for attr, matchAgainst in self.attrs.items():
843                    if not markupAttrMap:
844                         if hasattr(markupAttrs, 'get'):
845                            markupAttrMap = markupAttrs
846                         else:
847                            markupAttrMap = {}
848                            for k,v in markupAttrs:
849                                markupAttrMap[k] = v
850                    attrValue = markupAttrMap.get(attr)
851                    if not self._matches(attrValue, matchAgainst):
852                        match = False
853                        break
854            if match:
855                if markup:
856                    found = markup
857                else:
858                    found = markupName
859        return found
860
861    def search(self, markup):
862        #print 'looking for %s in %s' % (self, markup)
863        found = None
864        # If given a list of items, scan it for a text element that
865        # matches.
866        if isList(markup) and not isinstance(markup, Tag):
867            for element in markup:
868                if isinstance(element, NavigableString) \
869                       and self.search(element):
870                    found = element
871                    break
872        # If it's a Tag, make sure its name or attributes match.
873        # Don't bother with Tags if we're searching for text.
874        elif isinstance(markup, Tag):
875            if not self.text:
876                found = self.searchTag(markup)
877        # If it's text, make sure the text matches.
878        elif isinstance(markup, NavigableString) or \
879                 isString(markup):
880            if self._matches(markup, self.text):
881                found = markup
882        else:
883            raise Exception, "I don't know how to match against a %s" \
884                  % markup.__class__
885        return found
886
887    def _matches(self, markup, matchAgainst):
888        #print "Matching %s against %s" % (markup, matchAgainst)
889        result = False
890        if matchAgainst == True and type(matchAgainst) == types.BooleanType:
891            result = markup != None
892        elif callable(matchAgainst):
893            result = matchAgainst(markup)
894        else:
895            #Custom match methods take the tag as an argument, but all
896            #other ways of matching match the tag name as a string.
897            if isinstance(markup, Tag):
898                markup = markup.name
899            if markup and not isString(markup):
900                markup = unicode(markup)
901            #Now we know that chunk is either a string, or None.
902            if hasattr(matchAgainst, 'match'):
903                # It's a regexp object.
904                result = markup and matchAgainst.search(markup)
905            elif isList(matchAgainst):
906                result = markup in matchAgainst
907            elif hasattr(matchAgainst, 'items'):
908                result = markup.has_key(matchAgainst)
909            elif matchAgainst and isString(markup):
910                if isinstance(markup, unicode):
911                    matchAgainst = unicode(matchAgainst)
912                else:
913                    matchAgainst = str(matchAgainst)
914
915            if not result:
916                result = matchAgainst == markup
917        return result
918
919class ResultSet(list):
920    """A ResultSet is just a list that keeps track of the SoupStrainer
921    that created it."""
922    def __init__(self, source):
923        list.__init__([])
924        self.source = source
925
926# Now, some helper functions.
927
928def isList(l):
929    """Convenience method that works with all 2.x versions of Python
930    to determine whether or not something is listlike."""
931    return hasattr(l, '__iter__') \
932           or (type(l) in (types.ListType, types.TupleType))
933
934def isString(s):
935    """Convenience method that works with all 2.x versions of Python
936    to determine whether or not something is stringlike."""
937    try:
938        return isinstance(s, unicode) or isinstance(s, basestring)
939    except NameError:
940        return isinstance(s, str)
941
942def buildTagMap(default, *args):
943    """Turns a list of maps, lists, or scalars into a single map.
944    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
945    NESTING_RESET_TAGS maps out of lists and partial maps."""
946    built = {}
947    for portion in args:
948        if hasattr(portion, 'items'):
949            #It's a map. Merge it.
950            for k,v in portion.items():
951                built[k] = v
952        elif isList(portion):
953            #It's a list. Map each item to the default.
954            for k in portion:
955                built[k] = default
956        else:
957            #It's a scalar. Map it to the default.
958            built[portion] = default
959    return built
960
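# For example, buildTagMap(None, ['br', 'hr'], 'img') returns
# {'br': None, 'hr': None, 'img': None} -- a sketch of how the parser classes
# below assemble their tag-behavior maps.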
961# Now, the parser classes.
962
963class BeautifulStoneSoup(Tag, SGMLParser):
964
965    """This class contains the basic parser and search code. It defines
966    a parser that knows nothing about tag behavior except for the
967    following:
968
969      You can't close a tag without closing all the tags it encloses.
970      That is, "<foo><bar></foo>" actually means
971      "<foo><bar></bar></foo>".
972
973    [Another possible explanation is "<foo><bar /></foo>", but since
974    this class defines no SELF_CLOSING_TAGS, it will never use that
975    explanation.]
976
977    This class is useful for parsing XML or made-up markup languages,
978    or when BeautifulSoup makes an assumption counter to what you were
979    expecting."""
980
981    SELF_CLOSING_TAGS = {}
982    NESTABLE_TAGS = {}
983    RESET_NESTING_TAGS = {}
984    QUOTE_TAGS = {}
985
986    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
987                       lambda x: x.group(1) + ' />'),
988                      (re.compile('<!\s+([^<>]*)>'),
989                       lambda x: '<!' + x.group(1) + '>')
990                      ]
991
992    ROOT_TAG_NAME = u'[document]'
993
994    HTML_ENTITIES = "html"
995    XML_ENTITIES = "xml"
996    XHTML_ENTITIES = "xhtml"
997    # TODO: This only exists for backwards-compatibility
998    ALL_ENTITIES = XHTML_ENTITIES
999
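    # The three constants above are meant to be passed as the convertEntities
    # constructor argument, e.g. (a sketch):
    #
    #   BeautifulStoneSoup(markup, convertEntities=BeautifulStoneSoup.XML_ENTITIES)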
1000    # Used when determining whether a text node is all whitespace and
1001    # can be replaced with a single space. A text node that contains
1002    # fancy Unicode spaces (usually non-breaking) should be left
1003    # alone.
1004    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
1005
1006    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
1007                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
1008                 convertEntities=None, selfClosingTags=None):
1009        """The Soup object is initialized as the 'root tag', and the
1010        provided markup (which can be a string or a file-like object)
1011        is fed into the underlying parser.
1012
1013        sgmllib will process most bad HTML, and the BeautifulSoup
1014        class has some tricks for dealing with some HTML that kills
1015        sgmllib, but Beautiful Soup can nonetheless choke or lose data
1016        if your data uses self-closing tags or declarations
1017        incorrectly.
1018
1019        By default, Beautiful Soup uses regexes to sanitize input,
1020        avoiding the vast majority of these problems. If the problems
1021        don't apply to you, pass in False for markupMassage, and
1022        you'll get better performance.
1023
1024        The default parser massage techniques fix the two most common
1025        instances of invalid HTML that choke sgmllib:
1026
1027         <br/> (No space between the tag name and the self-closing slash)
1028         <! --Comment--> (Extraneous whitespace in declaration)
1029
1030        You can pass in a custom list of (RE object, replace method)
1031        tuples to get Beautiful Soup to scrub your input the way you
1032        want."""
1033
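        # An illustrative custom massage list (a sketch; this particular
        # fix-up is an assumption, not one of the built-in MARKUP_MASSAGE
        # rules):
        #
        #   myMassage = [(re.compile('<!-([^-])'), lambda m: '<!--' + m.group(1))]
        #   soup = BeautifulStoneSoup(markup, markupMassage=myMassage)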
1034        self.parseOnlyThese = parseOnlyThese
1035        self.fromEncoding = fromEncoding
1036        self.smartQuotesTo = smartQuotesTo
1037        self.convertEntities = convertEntities
1038        # Set the rules for how we'll deal with the entities we
1039        # encounter
1040        if self.convertEntities:
1041            # It doesn't make sense to convert encoded characters to
1042            # entities even while you're converting entities to Unicode.
1043            # Just convert it all to Unicode.
1044            self.smartQuotesTo = None
1045            if convertEntities == self.HTML_ENTITIES:
1046                self.convertXMLEntities = False
1047                self.convertHTMLEntities = True
1048                self.escapeUnrecognizedEntities = True
1049            elif convertEntities == self.XHTML_ENTITIES:
1050                self.convertXMLEntities = True
1051                self.convertHTMLEntities = True
1052                self.escapeUnrecognizedEntities = False
1053            elif convertEntities == self.XML_ENTITIES:
1054                self.convertXMLEntities = True
1055                self.convertHTMLEntities = False
1056                self.escapeUnrecognizedEntities = False
1057        else:
1058            self.convertXMLEntities = False
1059            self.convertHTMLEntities = False
1060            self.escapeUnrecognizedEntities = False
1061
1062        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
1063        SGMLParser.__init__(self)
1064
1065        if hasattr(markup, 'read'):        # It's a file-type object.
1066            markup = markup.read()
1067        self.markup = markup
1068        self.markupMassage = markupMassage
1069        try:
1070            self._feed()
1071        except StopParsing:
1072            pass
1073        self.markup = None                 # The markup can now be GCed
1074
1075    def convert_charref(self, name):
1076        """This method fixes a bug in Python's SGMLParser."""
1077        try:
1078            n = int(name)
1079        except ValueError:
1080            return
1081        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
1082            return
1083        return self.convert_codepoint(n)
1084
1085    def _feed(self, inDocumentEncoding=None):
1086        # Convert the document to Unicode.
1087        markup = self.markup
1088        if isinstance(markup, unicode):
1089            if not hasattr(self, 'originalEncoding'):
1090                self.originalEncoding = None
1091        else:
1092            dammit = UnicodeDammit\
1093                     (markup, [self.fromEncoding, inDocumentEncoding],
1094                      smartQuotesTo=self.smartQuotesTo)
1095            markup = dammit.unicode
1096            self.originalEncoding = dammit.originalEncoding
1097        if markup:
1098            if self.markupMassage:
1099                if not isList(self.markupMassage):
1100                    self.markupMassage = self.MARKUP_MASSAGE
1101                for fix, m in self.markupMassage:
1102                    markup = fix.sub(m, markup)
1103                # TODO: We get rid of markupMassage so that the
1104                # soup object can be deepcopied later on. Some
1105                # Python installations can't copy regexes. If anyone
1106                # was relying on the existence of markupMassage, this
1107                # might cause problems.
1108                del(self.markupMassage)
1109        self.reset()
1110
1111        SGMLParser.feed(self, markup)
1112        # Close out any unfinished strings and close all the open tags.
1113        self.endData()
1114        while self.currentTag.name != self.ROOT_TAG_NAME:
1115            self.popTag()
1116
1117    def __getattr__(self, methodName):
1118        """This method routes method call requests to either the SGMLParser
1119        superclass or the Tag superclass, depending on the method name."""
1120        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
1121
1122        if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
1123               or methodName.find('do_') == 0:
1124            return SGMLParser.__getattr__(self, methodName)
1125        elif methodName.find('__') != 0:
1126            return Tag.__getattr__(self, methodName)
1127        else:
1128            raise AttributeError
1129
1130    def isSelfClosingTag(self, name):
1131        """Returns true iff the given string is the name of a
1132        self-closing tag according to this parser."""
1133        return self.SELF_CLOSING_TAGS.has_key(name) \
1134               or self.instanceSelfClosingTags.has_key(name)
1135
1136    def reset(self):
1137        Tag.__init__(self, self, self.ROOT_TAG_NAME)
1138        self.hidden = 1
1139        SGMLParser.reset(self)
1140        self.currentData = []
1141        self.currentTag = None
1142        self.tagStack = []
1143        self.quoteStack = []
1144        self.pushTag(self)
1145
1146    def popTag(self):
1147        tag = self.tagStack.pop()
1148        # Tags with just one string-owning child get the child as a
1149        # 'string' property, so that soup.tag.string is shorthand for
1150        # soup.tag.contents[0]
1151        if len(self.currentTag.contents) == 1 and \
1152           isinstance(self.currentTag.contents[0], NavigableString):
1153            self.currentTag.string = self.currentTag.contents[0]
1154
1155        #print "Pop", tag.name
1156        if self.tagStack:
1157            self.currentTag = self.tagStack[-1]
1158        return self.currentTag
1159
1160    def pushTag(self, tag):
1161        #print "Push", tag.name
1162        if self.currentTag:
1163            self.currentTag.contents.append(tag)
1164        self.tagStack.append(tag)
1165        self.currentTag = self.tagStack[-1]
1166
1167    def endData(self, containerClass=NavigableString):
1168        if self.currentData:
1169            currentData = ''.join(self.currentData)
1170            if not currentData.translate(self.STRIP_ASCII_SPACES):
1171                if '\n' in currentData:
1172                    currentData = '\n'
1173                else:
1174                    currentData = ' '
1175            self.currentData = []
1176            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
1177                   (not self.parseOnlyThese.text or \
1178                    not self.parseOnlyThese.search(currentData)):
1179                return
1180            o = containerClass(currentData)
1181            o.setup(self.currentTag, self.previous)
1182            if self.previous:
1183                self.previous.next = o
1184            self.previous = o
1185            self.currentTag.contents.append(o)
1186
1187
1188    def _popToTag(self, name, inclusivePop=True):
1189        """Pops the tag stack up to and including the most recent
1190        instance of the given tag. If inclusivePop is false, pops the tag
1191        stack up to but *not* including the most recent instance of
1192        the given tag."""
1193        #print "Popping to %s" % name
1194        if name == self.ROOT_TAG_NAME:
1195            return
1196
1197        numPops = 0
1198        mostRecentTag = None
1199        for i in range(len(self.tagStack)-1, 0, -1):
1200            if name == self.tagStack[i].name:
1201                numPops = len(self.tagStack)-i
1202                break
1203        if not inclusivePop:
1204            numPops = numPops - 1
1205
1206        for i in range(0, numPops):
1207            mostRecentTag = self.popTag()
1208        return mostRecentTag
1209
1210    def _smartPop(self, name):
1211
1212        """We need to pop up to the previous tag of this type, unless
1213        one of this tag's nesting reset triggers comes between this
1214        tag and the previous tag of this type, OR unless this tag is a
1215        generic nesting trigger and another generic nesting trigger
1216        comes between this tag and the previous tag of this type.
1217
1218        Examples:
1219         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
1220         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
1221         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
1222
1223         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1224         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1225         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1226        """
1227
1228        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1229        isNestable = nestingResetTriggers != None
1230        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1231        popTo = None
1232        inclusive = True
1233        for i in range(len(self.tagStack)-1, 0, -1):
1234            p = self.tagStack[i]
1235            if (not p or p.name == name) and not isNestable:
1236                #Non-nestable tags get popped to the top or to their
1237                #last occurrence.
1238                popTo = name
1239                break
1240            if (nestingResetTriggers != None
1241                and p.name in nestingResetTriggers) \
1242                or (nestingResetTriggers == None and isResetNesting
1243                    and self.RESET_NESTING_TAGS.has_key(p.name)):
1244
1245                #If we encounter one of the nesting reset triggers
1246                #peculiar to this tag, or we encounter another tag
1247                #that causes nesting to reset, pop up to but not
1248                #including that tag.
1249                popTo = p.name
1250                inclusive = False
1251                break
1252            p = p.parent
1253        if popTo:
1254            self._popToTag(popTo, inclusive)
1255
1256    def unknown_starttag(self, name, attrs, selfClosing=0):
1257        #print "Start tag %s: %s" % (name, attrs)
1258        if self.quoteStack:
1259            #This is not a real tag.
1260            #print "<%s> is not real!" % name
1261            attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
1262            self.handle_data('<%s%s>' % (name, attrs))
1263            return
1264        self.endData()
1265
1266        if not self.isSelfClosingTag(name) and not selfClosing:
1267            self._smartPop(name)
1268
1269        if self.parseOnlyThese and len(self.tagStack) <= 1 \
1270               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1271            return
1272
1273        tag = Tag(self, name, attrs, self.currentTag, self.previous)
1274        if self.previous:
1275            self.previous.next = tag
1276        self.previous = tag
1277        self.pushTag(tag)
1278        if selfClosing or self.isSelfClosingTag(name):
1279            self.popTag()
1280        if name in self.QUOTE_TAGS:
1281            #print "Beginning quote (%s)" % name
1282            self.quoteStack.append(name)
1283            self.literal = 1
1284        return tag
1285
1286    def unknown_endtag(self, name):
1287        #print "End tag %s" % name
1288        if self.quoteStack and self.quoteStack[-1] != name:
1289            #This is not a real end tag.
1290            #print "</%s> is not real!" % name
1291            self.handle_data('</%s>' % name)
1292            return
1293        self.endData()
1294        self._popToTag(name)
1295        if self.quoteStack and self.quoteStack[-1] == name:
1296            self.quoteStack.pop()
1297            self.literal = (len(self.quoteStack) > 0)
1298
1299    def handle_data(self, data):
1300        self.currentData.append(data)
1301
1302    def _toStringSubclass(self, text, subclass):
1303        """Adds a certain piece of text to the tree as a NavigableString
1304        subclass."""
1305        self.endData()
1306        self.handle_data(text)
1307        self.endData(subclass)
1308
1309    def handle_pi(self, text):
1310        """Handle a processing instruction as a ProcessingInstruction
1311        object, possibly one with a %SOUP-ENCODING% slot into which an
1312        encoding will be plugged later."""
1313        if text[:3] == "xml":
1314            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
1315        self._toStringSubclass(text, ProcessingInstruction)
1316
1317    def handle_comment(self, text):
1318        "Handle comments as Comment objects."
1319        self._toStringSubclass(text, Comment)
1320
1321    def handle_charref(self, ref):
1322        "Handle character references as data."
1323        if self.convertEntities:
1324            data = unichr(int(ref))
1325        else:
1326            data = '&#%s;' % ref
1327        self.handle_data(data)
1328
1329    def handle_entityref(self, ref):
1330        """Handle entity references as data, possibly converting known
1331        HTML and/or XML entity references to the corresponding Unicode
1332        characters."""
1333        data = None
1334        if self.convertHTMLEntities:
1335            try:
1336                data = unichr(name2codepoint[ref])
1337            except KeyError:
1338                pass
1339
1340        if not data and self.convertXMLEntities:
1341                data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
1342
1343        if not data and self.convertHTMLEntities and \
1344            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
1345                # TODO: We've got a problem here. We're told this is
1346                # an entity reference, but it's not an XML entity
1347                # reference or an HTML entity reference. Nonetheless,
1348                # the logical thing to do is to pass it through as an
1349                # unrecognized entity reference.
1350                #
1351                # Except: when the input is "&carol;" this function
1352                # will be called with input "carol". When the input is
1353                # "AT&T", this function will be called with input
1354                # "T". We have no way of knowing whether a semicolon
1355                # was present originally, so we don't know whether
1356                # this is an unknown entity or just a misplaced
1357                # ampersand.
1358                #
1359                # The more common case is a misplaced ampersand, so I
1360                # escape the ampersand and omit the trailing semicolon.
1361                data = "&amp;%s" % ref
1362        if not data:
1363            # This case is different from the one above, because we
1364            # haven't already gone through a supposedly comprehensive
1365            # mapping of entities to Unicode characters. We might not
1366            # have gone through any mapping at all. So the chances are
1367            # very high that this is a real entity, and not a
1368            # misplaced ampersand.
1369            data = "&%s;" % ref
1370        self.handle_data(data)
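    # A minimal illustrative sketch (not part of the original source) of how
    # the rules above are expected to behave, assuming the soup was built
    # with convertEntities=BeautifulStoneSoup.HTML_ENTITIES:
    #
    #   "&eacute;" -> u"\xe9"         (known HTML entity, converted)
    #   "&carol;"  -> u"&amp;carol"   (unknown; ampersand escaped, semicolon dropped)
    #
    # With no convertEntities argument at all, both references are expected
    # to pass through unchanged.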
1371
1372    def handle_decl(self, data):
1373        "Handle DOCTYPEs and the like as Declaration objects."
1374        self._toStringSubclass(data, Declaration)
1375
1376    def parse_declaration(self, i):
1377        """Treat a bogus SGML declaration as raw data. Treat a CDATA
1378        declaration as a CData object."""
1379        j = None
1380        if self.rawdata[i:i+9] == '<![CDATA[':
1381             k = self.rawdata.find(']]>', i)
1382             if k == -1:
1383                 k = len(self.rawdata)
1384             data = self.rawdata[i+9:k]
1385             j = k+3
1386             self._toStringSubclass(data, CData)
1387        else:
1388            try:
1389                j = SGMLParser.parse_declaration(self, i)
1390            except SGMLParseError:
1391                toHandle = self.rawdata[i:]
1392                self.handle_data(toHandle)
1393                j = i + len(toHandle)
1394        return j
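    # Illustrative sketch (not part of the original source): a CDATA section
    # is expected to come back as a CData node rather than as markup.
    #
    #   >>> soup = BeautifulStoneSoup("<a><![CDATA[one < two]]></a>")
    #   >>> soup.a.string                        # expected: u'one < two'
    #   >>> isinstance(soup.a.string, CData)     # expected: True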
1395
1396class BeautifulSoup(BeautifulStoneSoup):
1397
1398    """This parser knows the following facts about HTML:
1399
1400    * Some tags have no closing tag and should be interpreted as being
1401      closed as soon as they are encountered.
1402
1403    * The text inside some tags (e.g. 'script') may contain tags which
1404      are not really part of the document and which should be parsed
1405      as text, not tags. If you want to parse the text as tags, you can
1406      always fetch it and parse it explicitly.
1407
1408    * Tag nesting rules:
1409
1410      Most tags can't be nested at all. For instance, the occurrence of
1411      a <p> tag should implicitly close the previous <p> tag.
1412
1413       <p>Para1<p>Para2
1414        should be transformed into:
1415       <p>Para1</p><p>Para2
1416
1417      Some tags can be nested arbitrarily. For instance, the occurrence
1418      of a <blockquote> tag should _not_ implicitly close the previous
1419      <blockquote> tag.
1420
1421       Alice said: <blockquote>Bob said: <blockquote>Blah
1422        should NOT be transformed into:
1423       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
1424
1425      Some tags can be nested, but the nesting is reset by the
1426      interposition of other tags. For instance, a <tr> tag should
1427      implicitly close the previous <tr> tag within the same <table>,
1428      but not close a <tr> tag in another table.
1429
1430       <table><tr>Blah<tr>Blah
1431        should be transformed into:
1432       <table><tr>Blah</tr><tr>Blah
1433        but,
1434       <tr>Blah<table><tr>Blah
1435        should NOT be transformed into
1436       <tr>Blah<table></tr><tr>Blah
1437
1438    Differing assumptions about tag nesting rules are a major source
1439    of problems with the BeautifulSoup class. If BeautifulSoup is not
1440    treating a tag as nestable when your page's author does, try
1441    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
1442    BeautifulStoneSoup before writing your own subclass."""
1443
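    # Illustrative sketch (not part of the original source) of the nesting
    # rules described above:
    #
    #   str(BeautifulSoup("<p>Para1<p>Para2"))
    #     is expected to yield "<p>Para1</p><p>Para2</p>"
    #
    #   str(BeautifulSoup("<blockquote>Foo<blockquote>Bar"))
    #     is expected to keep the second blockquote nested:
    #     "<blockquote>Foo<blockquote>Bar</blockquote></blockquote>"
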
1444    def __init__(self, *args, **kwargs):
1445        if not kwargs.has_key('smartQuotesTo'):
1446            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
1447        BeautifulStoneSoup.__init__(self, *args, **kwargs)
1448
1449    SELF_CLOSING_TAGS = buildTagMap(None,
1450                                    ['br' , 'hr', 'input', 'img', 'meta',
1451                                    'spacer', 'link', 'frame', 'base'])
1452
1453    QUOTE_TAGS = {'script' : None, 'textarea' : None}
1454
1455    #According to the HTML standard, each of these inline tags can
1456    #contain another tag of the same type. Furthermore, it's common
1457    #to actually use these tags this way.
1458    NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
1459                            'center']
1460
1461    #According to the HTML standard, these block tags can contain
1462    #another tag of the same type. Furthermore, it's common
1463    #to actually use these tags this way.
1464    NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
1465
1466    #Lists can contain other lists, but there are restrictions.
1467    NESTABLE_LIST_TAGS = { 'ol' : [],
1468                           'ul' : [],
1469                           'li' : ['ul', 'ol'],
1470                           'dl' : [],
1471                           'dd' : ['dl'],
1472                           'dt' : ['dl'] }
1473
1474    #Tables can contain other tables, but there are restrictions.
1475    NESTABLE_TABLE_TAGS = {'table' : [],
1476                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
1477                           'td' : ['tr'],
1478                           'th' : ['tr'],
1479                           'thead' : ['table'],
1480                           'tbody' : ['table'],
1481                           'tfoot' : ['table'],
1482                           }
1483
1484    NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
1485
1486    #If one of these tags is encountered, all tags up to the next tag of
1487    #this type are popped.
1488    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
1489                                     NON_NESTABLE_BLOCK_TAGS,
1490                                     NESTABLE_LIST_TAGS,
1491                                     NESTABLE_TABLE_TAGS)
1492
1493    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
1494                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
1495
1496    # Used to detect the charset in a META tag; see start_meta
1497    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)")
1498
1499    def start_meta(self, attrs):
1500        """Beautiful Soup can detect a charset included in a META tag,
1501        try to convert the document to that charset, and re-parse the
1502        document from the beginning."""
1503        httpEquiv = None
1504        contentType = None
1505        contentTypeIndex = None
1506        tagNeedsEncodingSubstitution = False
1507
1508        for i in range(0, len(attrs)):
1509            key, value = attrs[i]
1510            key = key.lower()
1511            if key == 'http-equiv':
1512                httpEquiv = value
1513            elif key == 'content':
1514                contentType = value
1515                contentTypeIndex = i
1516
1517        if httpEquiv and contentType: # It's an interesting meta tag.
1518            match = self.CHARSET_RE.search(contentType)
1519            if match:
1520                if getattr(self, 'declaredHTMLEncoding', None) or \
1521                       (self.originalEncoding == self.fromEncoding):
1522                    # This is our second pass through the document, or
1523                    # else an encoding was specified explicitly and it
1524                    # worked. Rewrite the meta tag.
1525                    newAttr = self.CHARSET_RE.sub\
1526                              (lambda match: match.group(1) +
1527                               "%SOUP-ENCODING%", contentType)
1528                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
1529                                               newAttr)
1530                    tagNeedsEncodingSubstitution = True
1531                else:
1532                    # This is our first pass through the document.
1533                    # Go through it again with the new information.
1534                    newCharset = match.group(3)
1535                    if newCharset and newCharset != self.originalEncoding:
1536                        self.declaredHTMLEncoding = newCharset
1537                        self._feed(self.declaredHTMLEncoding)
1538                        raise StopParsing
1539        tag = self.unknown_starttag("meta", attrs)
1540        if tag and tagNeedsEncodingSubstitution:
1541            tag.containsSubstitutions = True
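    # Illustrative sketch (not part of the original source): given byte input
    # containing
    #
    #   <meta http-equiv="Content-Type" content="text/html; charset=shift_jis">
    #
    # the first pass records the declared charset, raises StopParsing, and
    # re-parses the markup decoded as shift_jis; on the second pass the
    # charset value is replaced with %SOUP-ENCODING% so that rendering the
    # tag writes out whatever encoding the output actually uses.
    #
    #   >>> soup = BeautifulSoup(sjis_bytes)   # sjis_bytes: hypothetical input
    #   >>> soup.originalEncoding              # expected: 'shift_jis'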
1542
1543class StopParsing(Exception):
1544    pass
1545
1546class ICantBelieveItsBeautifulSoup(BeautifulSoup):
1547
1548    """The BeautifulSoup class is oriented towards skipping over
1549    common HTML errors like unclosed tags. However, sometimes it makes
1550    errors of its own. For instance, consider this fragment:
1551
1552     <b>Foo<b>Bar</b></b>
1553
1554    This is perfectly valid (if bizarre) HTML. However, the
1555    BeautifulSoup class will implicitly close the first b tag when it
1556    encounters the second 'b'. It will think the author wrote
1557    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
1558    there's no real-world reason to bold something that's already
1559    bold. When it encounters '</b></b>' it will close two more 'b'
1560    tags, for a grand total of three tags closed instead of two. This
1561    can throw off the rest of your document structure. The same is
1562    true of a number of other tags, listed below.
1563
1564    It's much more common for someone to forget to close a 'b' tag
1565    than to actually use nested 'b' tags, and the BeautifulSoup class
1566    handles the common case. This class handles the not-so-common
1567    case: where you can't believe someone wrote what they did, but
1568    it's valid HTML and BeautifulSoup screwed up by assuming it
1569    wouldn't be."""
1570
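    # Illustrative contrast (not part of the original source):
    #
    #   str(BeautifulSoup("<b>Foo<b>Bar</b></b>"))
    #     is expected to give "<b>Foo</b><b>Bar</b>"
    #
    #   str(ICantBelieveItsBeautifulSoup("<b>Foo<b>Bar</b></b>"))
    #     is expected to preserve the nesting: "<b>Foo<b>Bar</b></b>"
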
1571    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
1572     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
1573      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
1574      'big']
1575
1576    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']
1577
1578    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
1579                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
1580                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1581
1582class MinimalSoup(BeautifulSoup):
1583    """The MinimalSoup class is for parsing HTML that contains
1584    pathologically bad markup. It makes no assumptions about tag
1585    nesting, but it does know which tags are self-closing, that
1586    <script> tags contain Javascript and should not be parsed, that
1587    META tags may contain encoding information, and so on.
1588
1589    This also makes it better for subclassing than BeautifulStoneSoup
1590    or BeautifulSoup."""
1591
1592    RESET_NESTING_TAGS = buildTagMap('noscript')
1593    NESTABLE_TAGS = {}
1594
1595class BeautifulSOAP(BeautifulStoneSoup):
1596    """This class will push a tag with only a single string child into
1597    the tag's parent as an attribute. The attribute's name is the tag
1598    name, and the value is the string child. An example should give
1599    the flavor of the change:
1600
1601    <foo><bar>baz</bar></foo>
1602     =>
1603    <foo bar="baz"><bar>baz</bar></foo>
1604
1605    You can then access fooTag['bar'] instead of fooTag.barTag.string.
1606
1607    This is, of course, useful for scraping structures that tend to
1608    use subelements instead of attributes, such as SOAP messages. Note
1609    that it modifies its input, so don't print the modified version
1610    out.
1611
1612    I'm not sure how many people really want to use this class; let me
1613    know if you do. Mainly I like the name."""
1614
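    # Illustrative sketch (not part of the original source):
    #
    #   >>> soup = BeautifulSOAP("<foo><bar>baz</bar></foo>")
    #   >>> soup.foo['bar']        # expected: u'baz'
    #   >>> soup.foo.bar.string    # the child element is still there: u'baz'
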
1615    def popTag(self):
1616        if len(self.tagStack) > 1:
1617            tag = self.tagStack[-1]
1618            parent = self.tagStack[-2]
1619            parent._getAttrMap()
1620            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
1621                isinstance(tag.contents[0], NavigableString) and
1622                not parent.attrMap.has_key(tag.name)):
1623                parent[tag.name] = tag.contents[0]
1624        BeautifulStoneSoup.popTag(self)
1625
1626#Enterprise class names! It has come to our attention that some people
1627#think the names of the Beautiful Soup parser classes are too silly
1628#and "unprofessional" for use in enterprise screen-scraping. We feel
1629#your pain! For such-minded folk, the Beautiful Soup Consortium And
1630#All-Night Kosher Bakery recommends renaming this file to
1631#"RobustParser.py" (or, in cases of extreme enterprisiness,
1632#"RobustParserBeanInterface.class") and using the following
1633#enterprise-friendly class aliases:
1634class RobustXMLParser(BeautifulStoneSoup):
1635    pass
1636class RobustHTMLParser(BeautifulSoup):
1637    pass
1638class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
1639    pass
1640class RobustInsanelyWackAssHTMLParser(MinimalSoup):
1641    pass
1642class SimplifyingSOAPParser(BeautifulSOAP):
1643    pass
1644
1645######################################################
1646#
1647# Bonus library: Unicode, Dammit
1648#
1649# This class forces XML data into a standard format (usually to UTF-8
1650# or Unicode).  It is heavily based on code from Mark Pilgrim's
1651# Universal Feed Parser. It does not rewrite the XML or HTML to
1652# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1653# (XML) and BeautifulSoup.start_meta (HTML).
1654
1655# Autodetects character encodings.
1656# Download from http://chardet.feedparser.org/
1657try:
1658    import chardet
1659#    import chardet.constants
1660#    chardet.constants._debug = 1
1661except ImportError:
1662    chardet = None
1663
1664# cjkcodecs and iconv_codec make Python know about more character encodings.
1665# Both are available from http://cjkpython.i18n.org/
1666# They're built in if you use Python 2.4.
1667try:
1668    import cjkcodecs.aliases
1669except ImportError:
1670    pass
1671try:
1672    import iconv_codec
1673except ImportError:
1674    pass
1675
1676class UnicodeDammit:
1677    """A class for detecting the encoding of a *ML document and
1678    converting it to a Unicode string. If the source encoding is
1679    windows-1252, it can replace MS smart quotes with their HTML or XML
1680    equivalents."""
1681
1682    # This dictionary maps commonly seen values for "charset" in HTML
1683    # meta tags to the corresponding Python codec names. It only covers
1684    # values that aren't in Python's aliases and can't be determined
1685    # by the heuristics in find_codec.
1686    CHARSET_ALIASES = { "macintosh" : "mac-roman",
1687                        "x-sjis" : "shift-jis" }
1688
1689    def __init__(self, markup, overrideEncodings=[],
1690                 smartQuotesTo='xml'):
1691        self.markup, documentEncoding, sniffedEncoding = \
1692                     self._detectEncoding(markup)
1693        self.smartQuotesTo = smartQuotesTo
1694        self.triedEncodings = []
1695        if markup == '' or isinstance(markup, unicode):
1696            self.originalEncoding = None
1697            self.unicode = unicode(markup)
1698            return
1699
1700        u = None
1701        for proposedEncoding in overrideEncodings:
1702            u = self._convertFrom(proposedEncoding)
1703            if u: break
1704        if not u:
1705            for proposedEncoding in (documentEncoding, sniffedEncoding):
1706                u = self._convertFrom(proposedEncoding)
1707                if u: break
1708
1709        # If no luck and we have an auto-detection library, try that:
1710        if not u and chardet and not isinstance(self.markup, unicode):
1711            u = self._convertFrom(chardet.detect(self.markup)['encoding'])
1712
1713        # As a last resort, try utf-8 and windows-1252:
1714        if not u:
1715            for proposed_encoding in ("utf-8", "windows-1252"):
1716                u = self._convertFrom(proposed_encoding)
1717                if u: break
1718        self.unicode = u
1719        if not u: self.originalEncoding = None
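    # Illustrative sketch (not part of the original source), assuming chardet
    # is not installed: a lone 0xE9 byte is invalid UTF-8, so the
    # windows-1252 fallback is expected to succeed.
    #
    #   >>> dammit = UnicodeDammit("Sacr\xe9 bleu!")
    #   >>> dammit.unicode             # expected: u'Sacr\xe9 bleu!'
    #   >>> dammit.originalEncoding    # expected: 'windows-1252'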
1720
1721    def _subMSChar(self, orig):
1722        """Changes a MS smart quote character to an XML or HTML
1723        entity."""
1724        sub = self.MS_CHARS.get(orig)
1725        if type(sub) == types.TupleType:
1726            if self.smartQuotesTo == 'xml':
1727                sub = '&#x%s;' % sub[1]
1728            else:
1729                sub = '&%s;' % sub[0]
1730        return sub
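    # Illustrative sketch (not part of the original source), again assuming
    # chardet is not installed: cp1252 "smart quote" bytes are expected to be
    # rewritten as entities before the markup is decoded.
    #
    #   >>> UnicodeDammit('\x93Hi\x94', smartQuotesTo='xml').unicode
    #   u'&#x201C;Hi&#x201D;'
    #   >>> UnicodeDammit('\x93Hi\x94', smartQuotesTo='html').unicode
    #   u'&ldquo;Hi&rdquo;'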
1731
1732    def _convertFrom(self, proposed):
1733        proposed = self.find_codec(proposed)
1734        if not proposed or proposed in self.triedEncodings:
1735            return None
1736        self.triedEncodings.append(proposed)
1737        markup = self.markup
1738
1739        # Convert smart quotes to HTML if coming from an encoding
1740        # that might have them.
1741        if self.smartQuotesTo and proposed.lower() in ("windows-1252",
1742                                                      "iso-8859-1",
1743                                                      "iso-8859-2"):
1744            markup = re.compile("([\x80-\x9f])").sub \
1745                     (lambda x: self._subMSChar(x.group(1)),
1746                      markup)
1747
1748        try:
1749            # print "Trying to convert document to %s" % proposed
1750            u = self._toUnicode(markup, proposed)
1751            self.markup = u
1752            self.originalEncoding = proposed
1753        except Exception, e:
1754            # print "That didn't work!"
1755            # print e
1756            return None
1757        #print "Correct encoding: %s" % proposed
1758        return self.markup
1759
1760    def _toUnicode(self, data, encoding):
1761        '''Given a string and its encoding, decodes the string into Unicode.
1762        %encoding is a string recognized by encodings.aliases'''
1763
1764        # strip Byte Order Mark (if present)
1765        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
1766               and (data[2:4] != '\x00\x00'):
1767            encoding = 'utf-16be'
1768            data = data[2:]
1769        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
1770                 and (data[2:4] != '\x00\x00'):
1771            encoding = 'utf-16le'
1772            data = data[2:]
1773        elif data[:3] == '\xef\xbb\xbf':
1774            encoding = 'utf-8'
1775            data = data[3:]
1776        elif data[:4] == '\x00\x00\xfe\xff':
1777            encoding = 'utf-32be'
1778            data = data[4:]
1779        elif data[:4] == '\xff\xfe\x00\x00':
1780            encoding = 'utf-32le'
1781            data = data[4:]
1782        newdata = unicode(data, encoding)
1783        return newdata
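    # Illustrative sketch (not part of the original source): a byte order
    # mark both overrides the caller's encoding and is stripped from the
    # decoded result.
    #
    #   _toUnicode('\xef\xbb\xbfabc', 'ascii')     -> u'abc' (decoded as utf-8)
    #   _toUnicode('\xff\xfea\x00b\x00', 'ascii')  -> u'ab'  (decoded as utf-16le)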
1784
1785    def _detectEncoding(self, xml_data):
1786        """Given a document, tries to detect its XML encoding."""
1787        xml_encoding = sniffed_xml_encoding = None
1788        try:
1789            if xml_data[:4] == '\x4c\x6f\xa7\x94':
1790                # EBCDIC
1791                xml_data = self._ebcdic_to_ascii(xml_data)
1792            elif xml_data[:4] == '\x00\x3c\x00\x3f':
1793                # UTF-16BE
1794                sniffed_xml_encoding = 'utf-16be'
1795                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
1796            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
1797                     and (xml_data[2:4] != '\x00\x00'):
1798                # UTF-16BE with BOM
1799                sniffed_xml_encoding = 'utf-16be'
1800                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
1801            elif xml_data[:4] == '\x3c\x00\x3f\x00':
1802                # UTF-16LE
1803                sniffed_xml_encoding = 'utf-16le'
1804                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
1805            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
1806                     (xml_data[2:4] != '\x00\x00'):
1807                # UTF-16LE with BOM
1808                sniffed_xml_encoding = 'utf-16le'
1809                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
1810            elif xml_data[:4] == '\x00\x00\x00\x3c':
1811                # UTF-32BE
1812                sniffed_xml_encoding = 'utf-32be'
1813                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
1814            elif xml_data[:4] == '\x3c\x00\x00\x00':
1815                # UTF-32LE
1816                sniffed_xml_encoding = 'utf-32le'
1817                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
1818            elif xml_data[:4] == '\x00\x00\xfe\xff':
1819                # UTF-32BE with BOM
1820                sniffed_xml_encoding = 'utf-32be'
1821                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
1822            elif xml_data[:4] == '\xff\xfe\x00\x00':
1823                # UTF-32LE with BOM
1824                sniffed_xml_encoding = 'utf-32le'
1825                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
1826            elif xml_data[:3] == '\xef\xbb\xbf':
1827                # UTF-8 with BOM
1828                sniffed_xml_encoding = 'utf-8'
1829                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
1830            else:
1831                sniffed_xml_encoding = 'ascii'
1832                pass
1833            xml_encoding_match = re.compile \
1834                                 ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
1835                                 .match(xml_data)
1836        except:
1837            xml_encoding_match = None
1838        if xml_encoding_match:
1839            xml_encoding = xml_encoding_match.groups()[0].lower()
1840            if sniffed_xml_encoding and \
1841               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
1842                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
1843                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
1844                                 'utf16', 'u16')):
1845                xml_encoding = sniffed_xml_encoding
1846        return xml_data, xml_encoding, sniffed_xml_encoding
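    # Illustrative sketch (not part of the original source):
    #
    #   _detectEncoding('<?xml version="1.0" encoding="ISO-8859-1"?><a/>')
    #     is expected to return the data unchanged, with a declared encoding
    #     of 'iso-8859-1' and a sniffed encoding of 'ascii'.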
1847
1848
1849    def find_codec(self, charset):
1850        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
1851               or (charset and self._codec(charset.replace("-", ""))) \
1852               or (charset and self._codec(charset.replace("-", "_"))) \
1853               or charset
1854
1855    def _codec(self, charset):
1856        if not charset: return charset
1857        codec = None
1858        try:
1859            codecs.lookup(charset)
1860            codec = charset
1861        except (LookupError, ValueError):
1862            pass
1863        return codec
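    # Illustrative sketch (not part of the original source):
    #
    #   find_codec('macintosh')  -> 'mac-roman'   (via CHARSET_ALIASES)
    #   find_codec('ISO-8859-1') -> 'ISO-8859-1'  (already known to codecs)
    #   find_codec('bogus-enc')  -> 'bogus-enc'   (no codec found; returned as-is)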
1864
1865    EBCDIC_TO_ASCII_MAP = None
1866    def _ebcdic_to_ascii(self, s):
1867        c = self.__class__
1868        if not c.EBCDIC_TO_ASCII_MAP:
1869            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1870                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1871                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1872                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1873                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1874                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1875                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1876                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1877                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1878                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1879                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1880                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1881                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1882                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1883                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1884                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1885                    250,251,252,253,254,255)
1886            import string
1887            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
1888            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
1889        return s.translate(c.EBCDIC_TO_ASCII_MAP)
1890
1891    MS_CHARS = { '\x80' : ('euro', '20AC'),
1892                 '\x81' : ' ',
1893                 '\x82' : ('sbquo', '201A'),
1894                 '\x83' : ('fnof', '192'),
1895                 '\x84' : ('bdquo', '201E'),
1896                 '\x85' : ('hellip', '2026'),
1897                 '\x86' : ('dagger', '2020'),
1898                 '\x87' : ('Dagger', '2021'),
1899                 '\x88' : ('circ', '2C6'),
1900                 '\x89' : ('permil', '2030'),
1901                 '\x8A' : ('Scaron', '160'),
1902                 '\x8B' : ('lsaquo', '2039'),
1903                 '\x8C' : ('OElig', '152'),
1904                 '\x8D' : '?',
1905                 '\x8E' : ('#x17D', '17D'),
1906                 '\x8F' : '?',
1907                 '\x90' : '?',
1908                 '\x91' : ('lsquo', '2018'),
1909                 '\x92' : ('rsquo', '2019'),
1910                 '\x93' : ('ldquo', '201C'),
1911                 '\x94' : ('rdquo', '201D'),
1912                 '\x95' : ('bull', '2022'),
1913                 '\x96' : ('ndash', '2013'),
1914                 '\x97' : ('mdash', '2014'),
1915                 '\x98' : ('tilde', '2DC'),
1916                 '\x99' : ('trade', '2122'),
1917                 '\x9a' : ('scaron', '161'),
1918                 '\x9b' : ('rsaquo', '203A'),
1919                 '\x9c' : ('oelig', '153'),
1920                 '\x9d' : '?',
1921                 '\x9e' : ('#x17E', '17E'),
1922                 '\x9f' : ('Yuml', '178'),}
1923
1924#######################################################################
1925
1926
1927#By default, act as an HTML pretty-printer.
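#For example (illustrative): "python BeautifulSoup.py < messy.html" reads
#HTML from standard input and prints an indented version to standard output.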
1928if __name__ == '__main__':
1929    import sys
1930    soup = BeautifulSoup(sys.stdin.read())
1931    print soup.prettify()