root/lang/php/Scraper/library/Diggin/Scraper/Adapter/Htmlscraping.php @ 29139

Revision 29139, 11.6 kB (checked in by sasezaki, 4 years ago)
Line 
1<?php
2/**
3 * This class is a remodeling of HTMLScraping
4 *
5 * @see http://www.rcdtokyo.com/etc/htmlscraping/
6 */
7
8/**
9 * ---------------------------------------------------------------------
10 * HTMLScraping class
11 * ---------------------------------------------------------------------
12 * PHP versions 5 (5.1.3 and later)
13 * ---------------------------------------------------------------------
14 * LICENSE: This source file is subject to the GNU Lesser General Public
15 * License as published by the Free Software Foundation;
16 * either version 2.1 of the License, or any later version
17 * that is available through the world-wide-web at the following URI:
18 * http://www.gnu.org/licenses/lgpl.html
19 * If you did not have a copy of the GNU Lesser General Public License
20 * and are unable to obtain it through the web, please write to
21 * the Free Software Foundation, Inc.,
22 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 * ---------------------------------------------------------------------
24 */
25
26/**
27 * @see Diggin_Scraper_Adapter_Interface
28 */
29require_once 'Diggin/Scraper/Adapter/Interface.php';
30
class Diggin_Scraper_Adapter_Htmlscraping implements Diggin_Scraper_Adapter_Interface
{
    /**
     * Configuration array, set using ::setConfig().
     *
     * Keys read by this class:
     *  - 'tidy'          : options handed to tidy::parseString()
     *  - 'libxmloptions' : optional libxml options for SimpleXMLElement
     *
     * @var array
     * @see http://tidy.sourceforge.net/docs/quickref.html
     */
    protected $config = array('tidy' => array('output-xhtml' => true, 'wrap' => 0));

    /**
     * CDATA sections and comments taken out of the body during preprocessing,
     * indexed by the number used in their "<restore count=... />" placeholder.
     *
     * @var array
     */
    private $backup = array();

    /**
     * Number of placeholders issued so far by ::backup().
     *
     * @var integer
     */
    private $backup_count = 0;

    /**
     * Builds a SimpleXMLElement from the response.
     *
     * The body is first normalized by ::getXhtml(); the default namespace
     * declaration is then stripped before parsing.
     *
     * @param  Zend_Http_Response $response
     * @return SimpleXMLElement
     * @throws Diggin_Scraper_Adapter_Exception if preprocessing or XML parsing fails
     */
    final public function getXmlObject($response)
    {
        try {
            $xhtml = $this->getXhtml($response);
        } catch (Exception $e) {
            require_once 'Diggin/Scraper/Adapter/Exception.php';
            throw new Diggin_Scraper_Adapter_Exception($e);
        }

        /*
         * Remove default namespace.
         * This is because SimpleXMLElement->registerXPathNamespace() may cause
         * a problem under some circumstances (confirmed with PHP 5.1.6 so far).
         * As a result callers do not need registerXPathNamespace()
         * before using SimpleXMLElement->xpath().
         */
        $responseBody = preg_replace('/\sxmlns="[^"]+"/', '', $xhtml);

        try {
            //@see http://php.net/libxml.constants
            if (isset($this->config['libxmloptions'])) {
                $xml_object = @new SimpleXMLElement($responseBody, $this->config['libxmloptions']);
            } else {
                $xml_object = @new SimpleXMLElement($responseBody);
            }
        } catch (Exception $e) {
            require_once 'Diggin/Scraper/Adapter/Exception.php';
            throw new Diggin_Scraper_Adapter_Exception($e);
        }

        return $xml_object;
    }

    /**
     * Return XHTML string based on SimpleXML element.
     *
     * ::getXhtml() escapes every '&' as '&amp;' before parsing; this undoes
     * that escaping on the serialized element.
     *
     * @param  object  $element  object providing asXML()
     * @return string
     */
    final public function dumpElement($element)
    {
        return str_replace('&amp;', '&', $element->asXML());
    }

    /**
     * Return a formatted XHTML string made from the entity body of the
     * given response, prefixed with an XML declaration and an
     * XHTML 1.0 Transitional DOCTYPE.
     *
     * @param  Zend_Http_Response $response  object providing getBody() and getHeader()
     * @return string
     * @throws Diggin_Scraper_Adapter_Exception if the body is empty after preprocessing
     */
    final public function getXhtml($response)
    {
        /*
         * Remove BOM and NULLs.
         */
        $responseBody = preg_replace('/^\xef\xbb\xbf/', '', $response->getBody());
        $responseBody = str_replace("\x00", '', $responseBody);
        /*
         * Initialize the backups.
         */
        $this->backup = array();
        $this->backup_count = 0;
        /*
         * SCRIPT and STYLE elements are removed outright, content included.
         */
        $tags = array('script', 'style');
        foreach ($tags as $tag) {
            $responseBody = preg_replace("/<$tag\b[^>]*?>.*?<\/$tag\b[^>]*?>/si", '', $responseBody);
        }
        /*
         * Backup CDATA sections for later process.
         */
        $responseBody = preg_replace_callback(
            '/<!\[CDATA\[.*?\]\]>/s', array($this, 'backup'), $responseBody
        );
        /*
         * Comment section must not contain two or more adjacent hyphens.
         */
        $responseBody = preg_replace_callback(
            '/<!--(.*?)-->/si', array($this, 'collapseCommentHyphens'), $responseBody
        );
        /*
         * Backup comment sections for later process.
         */
        $responseBody = preg_replace_callback(
            '/<!--.*?-->/s', array($this, 'backup'), $responseBody
        );
        /*
         * Process tags that are potentially dangerous for XML parsers.
         */
        $responseBody = preg_replace_callback(
            '/(<textarea\b[^>]*?>)(.*?)(<\/textarea\b[^>]*?>)/si',
            array($this, 'escapeTextareaContent'),
            $responseBody
        );
        $responseBody = preg_replace_callback(
            '/<xmp\b[^>]*?>(.*?)<\/xmp\b[^>]*?>/si',
            array($this, 'convertToPre'),
            $responseBody
        );
        $responseBody = preg_replace_callback(
            '/<plaintext\b[^>]*?>(.*)$/si',
            array($this, 'convertToPre'),
            $responseBody
        );
        /*
         * Remove DTD declarations, wrongly placed comments etc.
         * This must be done before removing DOCTYPE.
         */
        $responseBody = preg_replace('/<!(?!DOCTYPE)[^>]*?>/si', '', $responseBody);
        /*
         * XML and DOCTYPE declaration will be replaced.
         */
        $responseBody = preg_replace('/<!DOCTYPE\b[^>]*?>/si', '', $responseBody);
        $responseBody = preg_replace('/<\?xml\b[^>]*?\?>/si', '', $responseBody);
        if (preg_match('/^\s*$/s', $responseBody)) {
            require_once 'Diggin/Scraper/Adapter/Exception.php';
            throw new Diggin_Scraper_Adapter_Exception('The entity body became empty after preprocessing.');
        }

        /*
         * Convert the body and the backed-up fragments to UTF-8.
         */
        require_once 'Diggin/Http/Response/Encoding.php';
        list($responseBody, $this->backup) =
        Diggin_Http_Response_Encoding::encode($responseBody,
                                             $response->getHeader('content-type'),
                                             'UTF-8',
                                             $this->backup);
        /*
         * Restore CDATAs and comments.
         * NOTE: placeholders are expanded in ascending order, so a CDATA
         * placeholder sitting inside a backed-up comment stays unexpanded.
         */
        for ($i = 0; $i < $this->backup_count; $i++) {
            $responseBody = str_replace("<restore count=\"$i\" />", $this->backup[$i], $responseBody);
        }

        /*
         * Replace every '&' with '&amp;'
         * for XML parser not to break on non-predefined entities.
         * So you may need to replace '&amp;' with '&'
         * to have the original HTML string from returned SimpleXML object.
         *
         * This is done before the formatter runs in both branches below.
         */
        $responseBody = str_replace('&', '&amp;', $responseBody);

        /*
         * Use Tidy to format HTML if available.
         * Otherwise, use HTMLParser class (is slower and consumes much memory).
         */
        if (extension_loaded('tidy')) {
            $tidy = new tidy;
            $tidy->parseString($responseBody, $this->config['tidy'], 'UTF8');
            $tidy->cleanRepair();
            $responseBody = $tidy->html();
        } else {
            require_once 'HTMLParser.class.php';
            $parser = new HTMLParser;
            $format_rule = require 'xhtml1-transitional_dtd.inc.php';
            $parser->setRule($format_rule);
            $parser->setRoot('html', array('xmlns' => 'http://www.w3.org/1999/xhtml'));
            $parser->setGenericParent('body');
            $parser->parse($responseBody);
            $responseBody = $parser->dump();
        }
        /*
         * Valid XHTML DOCTYPE declaration (with DTD URI) is required
         * for SimpleXMLElement->asXML() method to produce proper XHTML tags.
         */
        $declarations = '<?xml version="1.0" encoding="UTF-8"?>';
        $declarations .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" ';
        $declarations .= '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">';

        return "$declarations$responseBody";
    }

    /**
     * preg_replace_callback() handler: stores the matched CDATA section or
     * comment and returns the numbered placeholder that stands in for it.
     *
     * @param  array   $matches
     * @return string
     */
    private function backup($matches)
    {
        $this->backup[] = $matches[0];
        $replace = "<restore count=\"{$this->backup_count}\" />";
        $this->backup_count++;

        return $replace;
    }

    /**
     * preg_replace_callback() handler: rewrites a comment so that its body
     * holds no run of two or more hyphens.
     *
     * Implemented as a method rather than create_function(), which is
     * no longer available as of PHP 8.0.
     *
     * @param  array   $matches  [1] is the comment body
     * @return string
     */
    private function collapseCommentHyphens($matches)
    {
        return "<!-- ".preg_replace("/-{2,}/", "-", $matches[1])." -->";
    }

    /**
     * preg_replace_callback() handler: escapes '<' inside a textarea while
     * keeping its opening and closing tags untouched.
     *
     * @param  array   $matches  [1] opening tag, [2] content, [3] closing tag
     * @return string
     */
    private function escapeTextareaContent($matches)
    {
        return $matches[1].str_replace("<", "&lt;", $matches[2]).$matches[3];
    }

    /**
     * preg_replace_callback() handler: turns the captured content of an
     * XMP or PLAINTEXT element into a PRE element with '<' escaped.
     *
     * @param  array   $matches  [1] is the element content
     * @return string
     */
    private function convertToPre($matches)
    {
        return "<pre>".str_replace("<", "&lt;", $matches[1])."</pre>";
    }

    /**
     * Reading Response as SimpleXMLElement
     *
     * @param  Zend_Http_Response $response
     * @return SimpleXMLElement
     * @throws Diggin_Scraper_Adapter_Exception
     */
    public function readData($response)
    {
        return $this->getXmlObject($response);
    }

    /**
     * Set configuration parameters for this adapter.
     *
     * Keys are stored lowercased; validation runs on the lowercased keys so
     * that it checks exactly what ends up in the configuration.
     *
     * @param array $config
     * @return Diggin_Scraper_Adapter_Htmlscraping
     * @throws Diggin_Scraper_Adapter_Exception
     */
    public function setConfig($config = array())
    {
        if (!is_array($config)) {
            require_once 'Diggin/Scraper/Adapter/Exception.php';
            throw new Diggin_Scraper_Adapter_Exception('Expected array parameter, given ' . gettype($config));
        }

        // Normalize first: checking the caller's spelling would let
        // e.g. 'Tidy' slip past the output-xhtml guard below.
        $normalized = array();
        foreach ($config as $k => $v) {
            $normalized[strtolower($k)] = $v;
        }

        if (isset($normalized['tidy']['output-xhtml']) && $normalized['tidy']['output-xhtml'] !== true) {
            require_once 'Diggin/Scraper/Adapter/Exception.php';
            throw new Diggin_Scraper_Adapter_Exception('tidy-config "output-xhtml" not as true - not allowed');
        }

        foreach ($normalized as $k => $v) {
            $this->config[$k] = $v;
        }

        return $this;
    }
}
Note: See TracBrowser for help on using the browser.