- Timestamp: 09/29/08 20:15:18 (2 months ago)
- Location: lang/php/Scraper/library/Diggin
- Files: 1 added, 4 modified
  - Scraper/Adapter/Htmlscraping.php (modified) (6 diffs)
  - Scraper/Strategy/Flexible.php (added)
  - Scraper/Strategy/Selector.php (modified) (1 diff)
  - Scraper/Strategy/Xpath.php (modified) (2 diffs)
  - Uri/Http.php (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
-
lang/php/Scraper/library/Diggin/Scraper/Adapter/Htmlscraping.php
r17631 r20204 72 72 73 73 if ($bases = $xml_object->xpath('//base[@href]')) { 74 $bases[0]['href'] = $this->getAbsoluteUrl((string) $bases[0]['href'], $this->config['url']); 74 require_once 'Diggin/Uri/Http.php'; 75 $bases[0]['href'] = Diggin_Uri_Http::getAbsoluteUrl((string) $bases[0]['href'], $this->config['url']); 75 76 } else { 76 77 if (!$xml_object->head) { … … 203 204 $responseBody = preg_replace('/<\?xml\b[^>]*?\?>/si', '', $responseBody); 204 205 if (preg_match('/^\s*$/s', $responseBody)) { 205 throw new Exception('The entity body became empty after preprocessing.'); 206 require_once 'Diggin/Scraper/Adapter/Exception.php'; 207 throw new Diggin_Scraper_Adapter_Exception('The entity body became empty after preprocessing.'); 206 208 } 207 209 /* … … 230 232 @mb_detect_order('ASCII, JIS, UTF-8, EUC-JP, SJIS'); 231 233 if (false === $encoding = @mb_preferred_mime_name(@mb_detect_encoding($responseBody))) { 232 throw new Exception('Failed detecting character encoding.'); 234 require_once 'Diggin/Scraper/Adapter/Exception.php'; 235 throw new Diggin_Scraper_Adapter_Exception('Failed detecting character encoding.'); 233 236 } 234 237 } … … 240 243 foreach ($this->backup as $key => $value) { 241 244 if (false === $this->backup[$key] = @iconv($encoding, 'UTF-8', $value)) { 242 throw new Exception('Failed converting character encoding.'); 245 require_once 'Diggin/Scraper/Adapter/Exception.php'; 246 throw new Diggin_Scraper_Adapter_Exception('Failed converting character encoding.'); 243 247 } 244 248 } … … 306 310 307 311 /** 308 * @param string $name309 * @param string $string310 312 * @return mixed 311 313 */ … … 354 356 return $this; 355 357 } 356 357 /**358 * @param string $url359 * @param string $base_url360 * @return string361 */362 public static function getAbsoluteUrl($url, $base_url)363 {364 if (preg_match('/^[\w\+\-\.]+:/', $url) or false === $bases = @parse_url($base_url)) {365 return $url;366 } elseif (0 === strpos($url, '/')) {367 return 
"$bases[scheme]://$bases[host]".(isset($bases['port'])? ":$bases[port]": '').$url;368 } else {369 if (!isset($bases['path'])) {370 $bases['path'] = '/';371 }372 require_once 'Net/URL2.php';373 return "$bases[scheme]://$bases[host]".(isset($bases['port'])? ":$bases[port]": '').374 Net_URL2::resolvePath(substr($bases['path'], 0, strrpos($bases['path'], '/') +1).$url);375 }376 }377 358 } 378 359 -
lang/php/Scraper/library/Diggin/Scraper/Strategy/Selector.php
r17673 r20204 122 122 } elseif (strpos($process->type, '@') === 0) { 123 123 $strings = array(); 124 require_once 'Diggin/Uri/Http.php'; 124 125 foreach ($values as $value) { 125 //@todo 126 if (($process->type == '@href' OR $process->type == '@src') 127 && method_exists($this->getAdapter(), 'getAbsoluteUrl')) { 128 array_push($strings, $this->getAdapter()->getAbsoluteUrl((string)$value[substr($process->type, 1)], self::$_adapterconfig['url'])); 129 //require_once 'Diggin/Uri/Http.php'; 130 //array_push($strings, Diggin_Uri_Http::getAbsoluteUrl((string)$value[substr($process->type, 1)], self::$_adapterconfig['url'])); 126 if (($process->type == '@href' OR $process->type == '@src')) { 127 array_push($strings, Diggin_Uri_Http::getAbsoluteUrl((string)$value[substr($process->type, 1)], self::$_adapterconfig['url'])); 131 128 } else { 132 129 array_push($strings, (string) $value[substr($process->type, 1)]); -
lang/php/Scraper/library/Diggin/Scraper/Strategy/Xpath.php
r17673 r20204 123 123 } elseif (strpos($process->type, '@') === 0) { 124 124 $strings = array(); 125 require_once 'Diggin/ Scraper/Adapter/Htmlscraping.php';125 require_once 'Diggin/Uri/Http.php'; 126 126 foreach ($values as $value) { 127 127 if (($process->type == '@href' OR $process->type == '@src')) { 128 array_push($strings, Diggin_Scraper_Adapter_Htmlscraping::getAbsoluteUrl((string)$value[substr($process->type, 1)], self::$_adapterconfig['url'])); 129 130 //require_once 'Diggin/Uri/Http.php'; 131 //array_push($strings, Diggin_Uri_Http::getAbsoluteUrl((string)$value[substr($process->type, 1)], self::$_adapterconfig['url'])); 128 array_push($strings, Diggin_Uri_Http::getAbsoluteUrl((string)$value[substr($process->type, 1)], self::$_adapterconfig['url'])); 132 129 } else { 133 130 array_push($strings, (string) $value[substr($process->type, 1)]); … … 142 139 return $strings; 143 140 } 144 145 141 } 146 -
lang/php/Scraper/library/Diggin/Uri/Http.php
r17673 r20204 28 28 public static function getAbsoluteUrl($url, $base_url) 29 29 { 30 $parse = parse_url($url); 31 if (isset($parse["host"])) { 32 $build = $url; 30 //using pecl_http 31 if (extension_loaded('http')) { 32 static $base_url; 33 if (array_key_exists('host', parse_url($url))) { 34 return $url; 35 } else { 36 if (strpos(pathinfo(parse_url($base_url, PHP_URL_PATH), PATHINFO_DIRNAME), '/') === false) { 37 return http_build_url($base_url, array("path" => $url,), 38 HTTP_URL_STRIP_QUERY | HTTP_URL_STRIP_FRAGMENT); 39 } else { 40 return http_build_url($base_url, array("path" => $url,), 41 HTTP_URL_JOIN_PATH | HTTP_URL_STRIP_QUERY | HTTP_URL_STRIP_FRAGMENT); 42 } 43 } 44 //Net_URL2 ver 0.2.0 33 45 } else { 34 $uridir = pathinfo(parse_url($base_url, PHP_URL_PATH), PATHINFO_DIRNAME); 35 $slash = strpos($uridir, '/'); 36 if ($slash === false) { 37 $build = http_build_url($base_url, array("path" => $url,), 38 HTTP_URL_STRIP_QUERY | HTTP_URL_STRIP_FRAGMENT); 39 } else { 40 $build = http_build_url($base_url, array("path" => $url,), 41 HTTP_URL_JOIN_PATH | HTTP_URL_STRIP_QUERY | HTTP_URL_STRIP_FRAGMENT); 42 } 46 if (!class_exists('Net_URL2')) require_once 'Net/URL2.php'; 47 $neturl2 = new Net_URL2($base_url); 48 return $neturl2->resolve($url)->getUrl(); 43 49 } 44 45 return $build;46 50 } 47 51 } 52
![(please configure the [header_logo] section in trac.ini)](/share/chrome/site/your_project_logo.png)