| 64 | | } |
| 65 | | |
/**
 * String conversion hook.
 *
 * Invoked by PHP when the object is used in string context; the result is
 * whatever outertext() produces for this node.
 */
function __toString() {
    $markup = $this->outertext();
    return $markup;
}
| 69 | | |
/**
 * Magic property reader.
 *
 * Lookup order:
 *  1. an entry in $this->attr that passes isset() is returned as-is;
 *  2. the virtual properties 'outertext', 'innertext' and 'plaintext'
 *     are forwarded to the methods of the same name;
 *  3. anything else yields a boolean: true when the key exists in
 *     $this->attr (isset() is false for a null value, so such keys end
 *     up here), false when it does not.
 *
 * @param string $name property being read
 * @return mixed
 */
function __get($name) {
    // a stored attribute takes precedence over the virtual properties
    if (isset($this->attr[$name])) {
        return $this->attr[$name];
    }

    if ($name == 'outertext') {
        return $this->outertext();
    }
    if ($name == 'innertext') {
        return $this->innertext();
    }
    if ($name == 'plaintext') {
        return $this->plaintext();
    }

    // key present but not "set" => true, key absent => false
    return array_key_exists($name, $this->attr);
}
| 79 | | |
/**
 * Magic property writer.
 *
 * The virtual properties 'outertext', 'innertext' and 'plaintext' are
 * stored in $this->info under HDOM_INFO_OUTER, HDOM_INFO_INNER and
 * HDOM_INFO_TEXT respectively. Every other name is written to
 * $this->attr; when that attribute is not set yet, one entry is first
 * appended to the HDOM_INFO_SPACE list and one to the HDOM_INFO_QUOTE
 * list.
 *
 * @param string $name  property being written
 * @param mixed  $value value to store
 * @return mixed the stored value for the three virtual properties,
 *               nothing otherwise
 */
function __set($name, $value) {
    if ($name == 'outertext') {
        return $this->info[HDOM_INFO_OUTER] = $value;
    }
    if ($name == 'innertext') {
        return $this->info[HDOM_INFO_INNER] = $value;
    }
    if ($name == 'plaintext') {
        return $this->info[HDOM_INFO_TEXT] = $value;
    }

    $already_set = isset($this->attr[$name]);
    if (!$already_set) {
        // register spacing and quoting entries for the new attribute
        $this->info[HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->info[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }
    $this->attr[$name] = $value;
}
| 92 | | |
| 93 | | function __isset($name) { |
| 94 | | switch($name) { |
| 95 | | case 'outertext': return true; |
| 96 | | case 'innertext': return true; |
| 97 | | case 'plaintext': return true; |
| 98 | | } |
| 99 | | //no value attr: nowrap, checked selected... |
| 100 | | return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]); |
| 169 | | // begin tag |
| 170 | | $ret = $this->dom->nodes[$this->info[HDOM_INFO_BEGIN]]->text(); |
| 171 | | // inner |
| 172 | | if (isset($this->info[HDOM_INFO_INNER])) $ret .= $this->info[HDOM_INFO_INNER]; |
| 173 | | else {foreach($this->nodes as $n) $ret .= $n->outertext();} |
| 174 | | // end tag |
| 175 | | if($this->info[HDOM_INFO_END]) $ret .= $this->dom->nodes[$this->info[HDOM_INFO_END]]->text($this->tag); |
| | 147 | |
| | 148 | // render begin tag |
| | 149 | $ret = $this->dom->nodes[$this->info[HDOM_INFO_BEGIN]]->makeup(); |
| | 150 | |
| | 151 | // render inner text |
| | 152 | if (isset($this->info[HDOM_INFO_INNER])) |
| | 153 | $ret .= $this->info[HDOM_INFO_INNER]; |
| | 154 | else { |
| | 155 | foreach($this->nodes as $n) |
| | 156 | $ret .= $n->outertext(); |
| | 157 | } |
| | 158 | // render end tag |
| | 159 | if($this->info[HDOM_INFO_END]) |
| | 160 | $ret .= $this->dom->nodes[$this->info[HDOM_INFO_END]]->makeup($this->tag); |
| | 161 | |
| 181 | | if ($this->nodetype==HDOM_TYPE_TEXT) return $this->info[HDOM_INFO_TEXT]; |
| | 167 | if (isset($this->info[HDOM_INFO_INNER])) return $this->info[HDOM_INFO_INNER]; |
| | 168 | switch ($this->nodetype) { |
| | 169 | case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->info[HDOM_INFO_TEXT]); |
| | 170 | case HDOM_TYPE_COMMENT: return ''; |
| | 171 | case HDOM_TYPE_UNKNOWN: return ''; |
| | 172 | } |
| | 173 | if (strcasecmp($this->tag, 'script')==0) return ''; |
| | 174 | if (strcasecmp($this->tag, 'style')==0) return ''; |
| 228 | | if (($levle=count($selectors))==0) return array(); |
| 229 | | |
| 230 | | $ret = array(); |
| 231 | | $head = array($this->info[HDOM_INFO_BEGIN]=>1); |
| 232 | | |
| 233 | | // no recursive! |
| 234 | | for ($l=0; $l<$levle; ++$l) { |
| 235 | | $op = $selectors[$l]['op']; |
| 236 | | $key = $selectors[$l]['key']; |
| 237 | | $val = $selectors[$l]['val']; |
| 238 | | $tag = $selectors[$l]['tag']; |
| 239 | | $exp = $selectors[$l]['exp']; |
| 240 | | if ($this->dom->lowercase) { |
| 241 | | if ($tag) $tag = strtolower($tag); |
| 242 | | if ($key) $key = strtolower($key); |
| 243 | | } |
| 244 | | |
| 245 | | $ret = array(); |
| | 226 | if (($count=count($selectors))==0) return array(); |
| | 227 | $found_keys = array(); |
| | 228 | |
| | 229 | // find each selector |
| | 230 | for ($c=0; $c<$count; ++$c) { |
| | 231 | if (($levle=count($selectors[0]))==0) return array(); |
| | 232 | $head = array($this->info[HDOM_INFO_BEGIN]=>1); |
| | 233 | |
| | 234 | // handle descendant selectors, no recursive! |
| | 235 | for ($l=0; $l<$levle; ++$l) { |
| | 236 | $ret = array(); |
| | 237 | foreach($head as $k=>$v) { |
| | 238 | $n = ($k==-1) ? $this->dom->root : $this->dom->nodes[$k]; |
| | 239 | $n->seek($selectors[$c][$l], $ret); |
| | 240 | } |
| | 241 | $head = $ret; |
| | 242 | } |
| | 243 | |
| 247 | | $n = ($k==-1) ? $this->dom->root : $this->dom->nodes[$k]; |
| 248 | | $n->seek($op, $tag, $key, $val, $exp, $ret); |
| 249 | | } |
| 250 | | $head = $ret; |
| 251 | | } |
| 252 | | |
| 253 | | $final = array(); |
| 254 | | foreach($head as $k=>$v) $final[] = $this->dom->nodes[$k]; |
| 255 | | |
| 256 | | if ($idx<0) return $final; |
| 257 | | return (isset($final[$idx])) ? $final[$idx] : null; |
| 258 | | } |
| 259 | | |
| 260 | | protected function parse_selector($selector) { |
| | 245 | if (!isset($found_keys[$k])) |
| | 246 | $found_keys[$k] = 1; |
| | 247 | } |
| | 248 | } |
| | 249 | |
| | 250 | // sort keys |
| | 251 | ksort($found_keys); |
| | 252 | |
| | 253 | $found = array(); |
| | 254 | foreach($found_keys as $k=>$v) |
| | 255 | $found[] = $this->dom->nodes[$k]; |
| | 256 | |
| | 257 | // return nth-element or array |
| | 258 | if ($idx<0) return $found; |
| | 259 | return (isset($found[$idx])) ? $found[$idx] : null; |
| | 260 | } |
| | 261 | |
| | 262 | protected function parse_selector($selector_string) { |
| | 263 | // pattern of CSS selectors, modified from mootools |
| | 264 | $pattern = "/([A-Za-z0-9_\\-:]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[(\w+)(?:([!*^$]?=)[\"']?([^\"']*)[\"']?)?])?/"; |
| | 265 | |
| | 266 | // handle multiple selectors |
| | 267 | $selector_list = split(',', $selector_string); |
| 263 | | // parse CSS selectors, pattern is modified from mootools |
| 264 | | $pattern = "/([!]?)([A-Za-z0-9_\\-:]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[(\w+)(?:([!*^$]?=)[\"']?([^\"']*)[\"']?)?])?/"; |
| 265 | | preg_match_all($pattern, trim($selector), $matches, PREG_SET_ORDER); |
| 266 | | |
| 267 | | foreach ($matches as $v) { |
| 268 | | $op = true; |
| 269 | | $key = null; |
| 270 | | $val = null; |
| 271 | | $tag = $v[2]; |
| 272 | | $exp = '='; |
| 273 | | if ($v[0]=='') continue; |
| 274 | | if ($v[1]=='!') $op = false; |
| 275 | | if(!empty($v[3])) {$key = 'id'; $val = $v[3];} |
| 276 | | if(!empty($v[4])) {$key = 'class'; $val = $v[4];} |
| 277 | | if(!empty($v[5])) { |
| 278 | | $key = $v[5]; |
| 279 | | if(!empty($v[6])) $exp = $v[6]; |
| 280 | | if(!empty($v[7])) $val = $v[7]; |
| 281 | | } |
| 282 | | $selectors[] = array('op'=>$op, 'tag'=>$tag, 'key'=>$key, 'val'=>$val, 'exp'=>$exp); |
| | 270 | foreach($selector_list as $selector) { |
| | 271 | $result = array(); |
| | 272 | preg_match_all($pattern, trim($selector), $matches, PREG_SET_ORDER); |
| | 273 | |
| | 274 | foreach ($matches as $m) { |
| | 275 | list($tag, $key, $val, $exp) = array($m[1], null, null, '='); |
| | 276 | |
| | 277 | if ($m[0]=='') continue; |
| | 278 | if(!empty($m[2])) {$key='id'; $val=$m[2];} |
| | 279 | if(!empty($m[3])) {$key='class'; $val=$m[3];} |
| | 280 | if(!empty($m[4])) {$key=$m[4];} |
| | 281 | if(!empty($m[5])) {$exp=$m[5];} |
| | 282 | if(!empty($m[6])) {$val=$m[6];} |
| | 283 | |
| | 284 | // convert to lowercase |
| | 285 | if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower($key);} |
| | 286 | |
| | 287 | $result[] = array($tag, $key, $val, $exp); |
| | 288 | } |
| | 289 | $selectors[] = $result; |
| 288 | | protected function seek($op, $tag, $key, $val, $exp, &$ret) { |
| 289 | | for($i=$this->info[HDOM_INFO_BEGIN]+1; $i<$this->info[HDOM_INFO_END]; ++$i) { |
| 290 | | $n = $this->dom->nodes[$i]; |
| 291 | | if ($n->nodetype==HDOM_TYPE_ENDTAG) continue; |
| | 295 | protected function seek($selector, &$ret) { |
| | 296 | list($tag, $key, $val, $exp) = $selector; |
| | 297 | |
| | 298 | $end = $this->info[HDOM_INFO_END]; |
| | 299 | if ($end==0) |
| | 300 | $end = $this->parent->info[HDOM_INFO_END]-1; |
| | 301 | |
| | 302 | for($i=$this->info[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) { |
| | 303 | $node = $this->dom->nodes[$i]; |
| | 304 | if ($node->nodetype==HDOM_TYPE_ENDTAG) continue; |
| 296 | | $check = true; |
| 297 | | switch ($exp) { |
| 298 | | case '=': |
| 299 | | $check = ($n->attr[$key]===$val) ? true : false; break; |
| 300 | | case '!=': |
| 301 | | $check = ($n->attr[$key]!==$val) ? true : false; break; |
| 302 | | case '^=': |
| 303 | | $check = (preg_match("/^".preg_quote($val,'/')."/", $n->attr[$key])) ? true : false; break; |
| 304 | | case '$=': |
| 305 | | $check = (preg_match("/".preg_quote($val,'/')."$/", $n->attr[$key])) ? true : false; break; |
| 306 | | case '*=': |
| 307 | | $check = (preg_match("/".preg_quote($val,'/')."/", $n->attr[$key])) ? true : false; break; |
| | 313 | $check = $this->match($exp, $val, $node->attr[$key]); |
| | 314 | |
| | 315 | // handle multiple class |
| | 316 | if (!$check && strcasecmp($key, 'class')==0) { |
| | 317 | foreach(explode(' ',$node->attr[$key]) as $k) { |
| | 318 | $check = $this->match($exp, $val, $k); |
| | 319 | if ($check) break; |
| | 320 | } |
| 309 | | if (!isset($n->attr[$key]) || !$check) $pass = false; |
| 310 | | } |
| 311 | | if ($op) {if ($pass) $ret[$i] = 1;} |
| 312 | | else {if (!$pass) $ret[$i] = 1;} |
| 313 | | } |
| 314 | | unset($n); |
| | 322 | |
| | 323 | if (!$check) |
| | 324 | $pass = false; |
| | 325 | } |
| | 326 | |
| | 327 | if ($pass) |
| | 328 | $ret[$i] = 1; |
| | 329 | } |
| | 330 | unset($node); |
| | 331 | } |
| | 332 | |
/**
 * Test a value against a pattern using a selector comparison operator.
 *
 *   '='   value is identical (===) to the pattern
 *   '!='  value is not identical (!==) to the pattern
 *   '^='  value matches the regex /^<quoted pattern>/
 *   '$='  value matches the regex /<quoted pattern>$/
 *   '*='  value matches the regex /<quoted pattern>/
 *
 * Any other operator is treated as a match.
 *
 * @param string $exp     comparison operator
 * @param string $pattern text to look for
 * @param string $value   text being tested
 * @return bool
 */
protected function match($exp, $pattern, $value) {
    if ($exp == '=') {
        return $value === $pattern;
    }
    if ($exp == '!=') {
        return $value !== $pattern;
    }

    // the remaining operators are regex tests on the quoted pattern
    if ($exp == '^=') {
        $regex = '/^'.preg_quote($pattern, '/').'/';
    } elseif ($exp == '$=') {
        $regex = '/'.preg_quote($pattern, '/').'$/';
    } elseif ($exp == '*=') {
        $regex = '/'.preg_quote($pattern, '/').'/';
    } else {
        // unknown operator: nothing to test
        return true;
    }

    return (bool) preg_match($regex, $value);
}
| | 349 | |
/**
 * String conversion hook.
 *
 * Invoked by PHP when the object is used in string context; the result is
 * whatever outertext() produces for this node.
 */
function __toString() {
    $markup = $this->outertext();
    return $markup;
}
| | 353 | |
/**
 * Magic property reader.
 *
 * Lookup order:
 *  1. an entry in $this->attr that passes isset() is returned as-is;
 *  2. the virtual properties 'outertext', 'innertext' and 'plaintext'
 *     are forwarded to the methods of the same name;
 *  3. anything else yields a boolean: true when the key exists in
 *     $this->attr (isset() is false for a null value, so such keys end
 *     up here), false when it does not.
 *
 * @param string $name property being read
 * @return mixed
 */
function __get($name) {
    // a stored attribute takes precedence over the virtual properties
    if (isset($this->attr[$name])) {
        return $this->attr[$name];
    }

    if ($name == 'outertext') {
        return $this->outertext();
    }
    if ($name == 'innertext') {
        return $this->innertext();
    }
    if ($name == 'plaintext') {
        return $this->plaintext();
    }

    // key present but not "set" => true, key absent => false
    return array_key_exists($name, $this->attr);
}
| | 363 | |
/**
 * Magic property writer.
 *
 * The virtual properties 'outertext', 'innertext' and 'plaintext' are
 * stored in $this->info under HDOM_INFO_OUTER, HDOM_INFO_INNER and
 * HDOM_INFO_TEXT respectively. Every other name is written to
 * $this->attr; when that attribute is not set yet, one entry is first
 * appended to the HDOM_INFO_SPACE list and one to the HDOM_INFO_QUOTE
 * list.
 *
 * @param string $name  property being written
 * @param mixed  $value value to store
 * @return mixed the stored value for the three virtual properties,
 *               nothing otherwise
 */
function __set($name, $value) {
    switch($name) {
        case 'outertext': return $this->info[HDOM_INFO_OUTER] = $value;
        case 'innertext': return $this->info[HDOM_INFO_INNER] = $value;
        // Store the value. This case used to return the restored current
        // text and drop $value, so assigning plaintext did nothing.
        case 'plaintext': return $this->info[HDOM_INFO_TEXT] = $value;
    }

    if (!isset($this->attr[$name])) {
        // register spacing and quoting entries for the new attribute
        $this->info[HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->info[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }
    $this->attr[$name] = $value;
}
| | 376 | |
| | 377 | function __isset($name) { |
| | 378 | switch($name) { |
| | 379 | case 'outertext': return true; |
| | 380 | case 'innertext': return true; |
| | 381 | case 'plaintext': return true; |
| | 382 | } |
| | 383 | //no value attr: nowrap, checked selected... |
| | 384 | return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]); |
| 458 | | } |
| 459 | | |
/**
 * Replace every match of $pattern in $this->html with a placeholder key.
 *
 * Each placeholder is '___noise___' followed by the current size of
 * $this->noise, space-padded to three characters. The replaced text is
 * recorded in $this->noise under that key, or an empty string is recorded
 * when $remove_contents is true. Afterwards $this->size and $this->char
 * are refreshed from the modified markup.
 *
 * @param string $pattern         regex handed to preg_match_all()
 * @param bool   $remove_tag      true: replace the whole match (group 0);
 *                                false: replace only capture group 1
 * @param bool   $remove_contents true: record '' instead of the matched text
 */
function remove_noise($pattern, $remove_tag=true, $remove_contents=true) {
    $count = preg_match_all($pattern, $this->html, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);

    // which capture group gets swapped out
    $group = $remove_tag ? 0 : 1;

    // Walk from the last match to the first so that a replacement never
    // shifts the recorded offsets of the matches still to be processed.
    $i = $count - 1;
    while ($i > -1) {
        $matched_text = $matches[$i][$group][0];
        $offset = $matches[$i][$group][1];

        $key = '___noise___'.sprintf('% 3d', count($this->noise));
        if ($remove_contents) {
            $this->noise[$key] = '';
        } else {
            $this->noise[$key] = $matched_text;
        }
        $this->html = substr_replace($this->html, $key, $offset, strlen($matched_text));

        --$i;
    }

    // reset the length of content
    $this->size = strlen($this->html);
    if ($this->size > 0) {
        $this->char = $this->html[0];
    }
}
| 475 | | |
| 476 | | // restore noise to html content |
| 477 | | protected function restore_noise($text) { |
| 478 | | while(($pos=strpos($text, '___noise___'))!==false) { |
| 479 | | $key = '___noise___'.$text[$pos+11].$text[$pos+12].$text[$pos+13]; |
| 480 | | if (isset($this->noise[$key])) |
| 481 | | $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos+14); |
| 482 | | } |
| 483 | | return $text; |
| | 534 | unset($this->html); |
| | 535 | unset($this->noise); |
| 555 | | // comment |
| 556 | | if ($node->tag=='!--___noise___') { |
| 557 | | $node->nodetype = HDOM_TYPE_COMMENT; |
| 558 | | $node->info[HDOM_INFO_END] = 0; |
| 559 | | $node->info[HDOM_INFO_TEXT] = '<' . $node->tag . $this->copy_until_char_escape('>'); |
| 560 | | $node->tag = 'comment'; |
| | 606 | // doctype, cdata & comments... |
| | 607 | if (isset($node->tag[0]) && $node->tag[0]=='!') { |
| | 608 | $node->info[HDOM_INFO_TEXT] = '<' . $node->tag . $this->copy_until_char('>'); |
| | 609 | |
| | 610 | if (isset($node->tag[2]) && $node->tag[1]=='-' && $node->tag[2]=='-') { |
| | 611 | $node->nodetype = HDOM_TYPE_COMMENT; |
| | 612 | $node->tag = 'comment'; |
| | 613 | } else { |
| | 614 | $node->nodetype = HDOM_TYPE_UNKNOWN; |
| | 615 | $node->tag = 'unknown'; |
| | 616 | } |
| | 617 | |
| 709 | | $ret = ''; |
| 710 | | while ($this->char!=$char && $this->char!==null) { |
| 711 | | $ret .= $this->char; |
| 712 | | $this->char = (++$this->pos<$this->size) ? $this->html[$this->pos] : null; // next |
| 713 | | } |
| | 758 | if ($this->char===null) return ''; |
| | 759 | |
| | 760 | if (($pos = strpos($this->html, $char, $this->pos))===false) { |
| | 761 | $ret = substr($this->html, $this->pos, $this->size-$this->pos); |
| | 762 | $this->char = null; |
| | 763 | $this->pos = $this->size; |
| | 764 | return $ret; |
| | 765 | } |
| | 766 | |
| | 767 | if ($pos==$this->pos) return ''; |
| | 768 | |
| | 769 | $ret = substr($this->html, $this->pos, $pos-$this->pos); |
| | 770 | $this->char = $this->html[$pos]; |
| | 771 | $this->pos = $pos; |
| 718 | | $ret = ''; |
| 719 | | while ($this->char!=$char && $this->char!==null) { |
| 720 | | // ignore string escape |
| 721 | | if ($this->char=='\\') { |
| 722 | | $ret .= $this->char; |
| 723 | | $this->char = (++$this->pos<$this->size) ? $this->html[$this->pos] : null; // next |
| 724 | | } |
| 725 | | $ret .= $this->char; |
| 726 | | $this->char = (++$this->pos<$this->size) ? $this->html[$this->pos] : null; // next |
| 727 | | } |
| 728 | | return $ret; |
| | 776 | if ($this->char===null) return ''; |
| | 777 | |
| | 778 | $start = $this->pos; |
| | 779 | while(1) { |
| | 780 | if (($pos = strpos($this->html, $char, $start))===false) { |
| | 781 | $ret = substr($this->html, $this->pos, $this->size-$this->pos); |
| | 782 | $this->char = null; |
| | 783 | $this->pos = $this->size; |
| | 784 | return $ret; |
| | 785 | } |
| | 786 | |
| | 787 | if ($pos==$this->pos) return ''; |
| | 788 | |
| | 789 | if ($this->html[$pos-1]==='\\') { |
| | 790 | $start = $pos+1; |
| | 791 | continue; |
| | 792 | } |
| | 793 | |
| | 794 | $ret = substr($this->html, $this->pos, $pos-$this->pos); |
| | 795 | $this->char = $this->html[$pos]; |
| | 796 | $this->pos = $pos; |
| | 797 | return $ret; |
| | 798 | } |
| | 799 | } |
| | 800 | |
/**
 * Replace every match of $pattern in $this->html with a placeholder key.
 *
 * Each placeholder is '___noise___' followed by the current size of
 * $this->noise, space-padded to three characters. The replaced text is
 * recorded in $this->noise under that key, or an empty string is recorded
 * when $remove_contents is true. Afterwards $this->size and $this->char
 * are refreshed from the modified markup.
 *
 * @param string $pattern         regex handed to preg_match_all()
 * @param bool   $remove_tag      true: replace the whole match (group 0);
 *                                false: replace only capture group 1
 * @param bool   $remove_contents true: record '' instead of the matched text
 */
function remove_noise($pattern, $remove_tag=true, $remove_contents=true) {
    $count = preg_match_all($pattern, $this->html, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);

    // which capture group gets swapped out
    $group = $remove_tag ? 0 : 1;

    // Walk from the last match to the first so that a replacement never
    // shifts the recorded offsets of the matches still to be processed.
    $i = $count - 1;
    while ($i > -1) {
        $matched_text = $matches[$i][$group][0];
        $offset = $matches[$i][$group][1];

        $key = '___noise___'.sprintf('% 3d', count($this->noise));
        if ($remove_contents) {
            $this->noise[$key] = '';
        } else {
            $this->noise[$key] = $matched_text;
        }
        $this->html = substr_replace($this->html, $key, $offset, strlen($matched_text));

        --$i;
    }

    // reset the length of content
    $this->size = strlen($this->html);
    if ($this->size > 0) {
        $this->char = $this->html[0];
    }
}
| | 817 | |
/**
 * Put previously removed noise back into a piece of text.
 *
 * Scans $text for placeholder keys ('___noise___' followed by three more
 * characters) and replaces each key found in $this->noise with the value
 * recorded for it. Restored content is scanned again, so placeholders
 * nested inside a restored value are expanded as well.
 *
 * Markers whose key is not registered, or that are cut off by the end of
 * the text, are left in place and skipped. Previously the search always
 * restarted at the beginning of the text, so such a marker was found
 * again on every pass and the loop never ended.
 *
 * @param string $text text that may contain placeholder keys
 * @return string text with all known placeholders expanded
 */
function restore_noise($text) {
    $marker = '___noise___';
    $marker_length = strlen($marker);
    $key_length = $marker_length + 3;
    $offset = 0;

    while (($pos = strpos($text, $marker, $offset)) !== false) {
        $key = substr($text, $pos, $key_length);

        if (strlen($key) == $key_length && isset($this->noise[$key])) {
            $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos + $key_length);
            // stay at $pos: the restored value may itself hold placeholders
            $offset = $pos;
        } else {
            // unknown or truncated key: step over the marker and keep going
            $offset = $pos + $marker_length;
        }
    }
    return $text;
}
| | 827 | |
/**
 * String conversion hook.
 *
 * Invoked by PHP when the object is used in string context; the result is
 * whatever save() returns.
 */
function __toString() {
    $markup = $this->save();
    return $markup;
}
| | 831 | |
| | 832 | function __get($name) { |
| | 833 | switch($name) { |
| | 834 | case 'outertext': return $this->save(); |
| | 835 | case 'innertext': return $this->root->innertext(); |
| | 836 | case 'plaintext': return $this->root->plaintext(); |
| | 837 | } |