Changeset 9590 for lang/javascript/jsAutoPageScraper
- Timestamp:
- 04/16/08 22:49:38 (5 years ago)
- Location:
- lang/javascript/jsAutoPageScraper/trunk
- Files:
  - 3 modified
    - js/jsAutoPageScraper.js (modified) (19 diffs)
    - test/jsAutoPageScraper-test.js (modified) (2 diffs)
    - test/yahoo_auction.js (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
lang/javascript/jsAutoPageScraper/trunk/js/jsAutoPageScraper.js
r9424 r9590 40 40 } : 41 41 function(element, evt, func) { 42 element.detachEvent('on' +evt, func);42 element.detachEvent('on' + evt, func); 43 43 }; 44 44 45 45 46 46 /** 47 * Create hidden(invisible) iframe in order to scrape contents from DOM tree48 * 49 * @param {String} url - URL path to scrape 50 * @param {Function} callback - Callback function, invoked when iframe document load complete 47 * Create a hidden(invisible) iframe in order to scrape contents from a DOM tree. 48 * 49 * @param {String} url - URL path to scrape. 50 * @param {Function} callback - Callback function, invoked when iframe document load completed. 51 51 * @param {Object} scope - Scope object that callback to be applied. 52 52 */ 53 53 function createIframe(url, callback, scope) { 54 54 // ugly W/A to prevent cache... 55 if (url ==window.location.href) {56 url += url.indexOf('?') >=0 ? '&' : '?';55 if (url == window.location.href) { 56 url += url.indexOf('?') >= 0 ? '&' : '?'; 57 57 } 58 58 var iframeEl = windoc.createElement('iframe'); … … 72 72 73 73 74 // detect base URL from own script URL 74 // detect base URL from own script URL. 
75 75 var baseUrl = (function() { 76 76 var scripts = windoc.getElementsByTagName('script'); 77 for (var i=0,len=scripts.length; i < len; i++) { 78 var m = (scripts[i].src || '').match(/^(.*\/)jsAutoPageScraper\.js$/); 79 if (m) return m[1]; 80 } 77 for (var i=scripts.length,m; i-- > 0; ) { 78 if (m = /^.*\/(?=jsAutoPageScraper\.js$)/.exec(scripts[i])) { 79 return m[0]; 80 } 81 } 82 return ''; 81 83 })(); 82 84 83 // assuming JavaScript-XPath lib is located in following path :85 // assuming JavaScript-XPath lib is located in the following path : 84 86 var xpathLibUrl = baseUrl + 'javascript-xpath/javascript-xpath.js'; 85 87 86 88 87 89 /** 88 * Append JavaScript-XPath (http://coderepos.org/share/wiki/JavaScript-XPath) to HTML Document89 * 90 * @param {Document} doc - HTML Document to add XPath lib 91 * @param {Function} callback - callback function, invoked when XPath lib loaded 90 * Append JavaScript-XPath <http://coderepos.org/share/wiki/JavaScript-XPath> to HTML Document. 91 * 92 * @param {Document} doc - HTML Document to add XPath lib. 93 * @param {Function} callback - callback function, invoked when XPath lib loaded. 92 94 * @param {Object} scope - Scope object that callback to be applied. 93 95 */ … … 99 101 scriptEl.type = 'text/javascript'; 100 102 scriptEl.src = xpathLibUrl; 101 var ctEl = doc.documentElement; 102 ctEl.appendChild(scriptEl); 103 doc.documentElement.appendChild(scriptEl); 103 104 pollXPathLibLoadStatus(); 104 105 } … … 117 118 * Query items from HTML doc matching to the given XPath expression. 118 119 * 119 * @param {Document} doc - HTML Document 120 * @param {String} xpath - XPath expression 121 * @param {Node} contextNode - context node to search elements by xpath 120 * @param {Document} doc - HTML Document. 121 * @param {String} xpath - XPath expression. 122 * @param {Node} contextNode - context node to search elements by xpath. 
122 123 * @returns XPath matching nodes 123 124 * @type Node … … 135 136 * Query one item from HTML doc matching to the given XPath expression. 136 137 * 137 * @param {Document} doc - HTML Document 138 * @param {String} xpath - XPath expression 139 * @param {Node} contextNode - context node to search elements by xpath 138 * @param {Document} doc - HTML Document. 139 * @param {String} xpath - XPath expression. 140 * @param {Node} contextNode - context node to search elements by xpath. 140 141 * @returns XPath matching node 141 142 * @type Node … … 147 148 148 149 /** 149 * Convert DOM to JavaScript Object 150 * 151 * @param {Node} elem - HTML Document Node 152 * @param {String|Object|Function} mapping - Mapping information 150 * Convert DOM to JavaScript Object. 151 * 152 * @param {Node} elem - HTML Document Node. 153 * @param {String|Object|Function} mapping - Mapping information. 153 154 * @returns JavaScript object 154 155 * @type Object … … 165 166 return mapping(elem); 166 167 case 'object' : 167 var o = {} 168 var o = {}; 168 169 for (var k in mapping) { 169 170 o[k] = convertDOM2Object(elem, mapping[k]); … … 182 183 /** 183 184 * JSAutoPageScraper 184 * 185 * 185 186 * Usage : <pre> 186 187 var jscr = new JSAutoPageScraper('http://twitter.com/home', { … … 191 192 author : './/td[contains(@class,"content")]/strong/a/text()', 192 193 content : function(p) { return (p.textContent || p.innerText).replace(/\s+/g, ' '); } 193 } 194 } 194 195 }); 195 196 … … 208 209 * 209 210 * @constructor 210 * @param {String} url - Page URL to start scraping 211 * @param {Object} info - Siteinfo config, includes "nextLink", "paragraph" ,and "mapping".211 * @param {String} url - Page URL to start scraping. 212 * @param {Object} info - Siteinfo config, includes "nextLink", "paragraph" and "mapping". 
212 213 */ 213 214 var JSAutoPageScraper = function(url, info) { 214 215 this.proxy = new ScrapeProxy(url, info); 215 216 this.cmdQueue = []; 216 } 217 }; 217 218 218 219 … … 223 224 224 225 /** 225 * Scrape paragraphs by given count. 226 * 227 * @param {Number} count - Number of records to fetch. 226 * Scrape paragraphs by given count. 227 * 228 * @param {Number} count - Number of records to fetch. 228 229 * @param {Function} callback - Callback function to get scraped records. 229 230 * @param {Object} scope - Scope object that callback to be applied. … … 240 241 count, 241 242 function(rec) { records.push(rec) }, 242 function() { callback.call(scope, records); _this.force() } 243 function() { callback.call(scope, records); _this.force() } 243 244 ] 244 245 } … … 251 252 /** 252 253 * Skip and proceed cursor by given count. 253 * 254 * @param {Number} count - Number of paragraphs to skip. 254 * 255 * @param {Number} count - Number of paragraphs to skip. 255 256 * @returns Self instance 256 257 * @type JSAutoPageScraper … … 269 270 /** 270 271 * Refresh queued commands and wait next command by keeping current cursor and resources. 271 * 272 * 272 273 * @returns Self instance 273 274 * @type JSAutoPageScraper … … 281 282 /** 282 283 * Refresh queued commands and cleanup all resources pointing to scraping document. 283 * 284 * 284 285 * @returns Self instance 285 286 * @type JSAutoPageScraper … … 296 297 297 298 /** 298 * Evaluate delayed command 299 * Evaluate delayed command. 299 300 */ 300 301 force : function() { … … 302 303 if (cmd) this.proxy[cmd.method].apply(this.proxy, cmd.args); 303 304 } 304 } 305 }; 305 306 306 307 307 308 /** 308 309 * Proxy object that accesses scraping document. 
309 * 310 * 310 311 * @constructor 311 * @param {String} url - Page URL to start scraping 312 * @param {Object} info - Siteinfo config, includes "nextLink", "paragraph" ,and "mapping".313 */ 314 var ScrapeProxy = function(url, info) { 312 * @param {String} url - Page URL to start scraping. 313 * @param {Object} info - Siteinfo config, includes "nextLink", "paragraph" and "mapping". 314 */ 315 var ScrapeProxy = function(url, info) { 315 316 this.url = this.currUrl = url; 316 317 this.siteinfo = info; 317 318 this.converter = function(el) { 318 319 return convertDOM2Object(el, info.mapping); 319 } 320 } 320 }; 321 }; 321 322 322 323 … … 327 328 328 329 /** 329 * Fetch records 330 * 330 * Fetch records. 331 * 331 332 * @param {Number} count - Fetching count. 332 333 * @param {Function} onScanRecord - Callbacked when scanning fetched record. … … 346 347 347 348 /** 348 * Clear and reset fetching state .349 * Clear and reset fetching states. 349 350 */ 350 351 clear : function() { -
lang/javascript/jsAutoPageScraper/trunk/test/jsAutoPageScraper-test.js
r9412 r9590 496 496 var t = typeof o; 497 497 if (t=='string') { 498 return '"'+o.replace(/ "/g, '\\"').replace(/\n/g, '\\n')+'"';498 return '"'+o.replace(/(?=")/g, '\\').replace(/\r?\n|\r/g, '\\n')+'"'; 499 499 } else if (t=='number' || t=='boolean' || o instanceof Date) { 500 500 return (o).toString(); … … 525 525 ta.value = debuglogs.join('\n'); 526 526 } 527 } 527 }; 528 528 529 529 -
lang/javascript/jsAutoPageScraper/trunk/test/yahoo_auction.js
r9413 r9590 9 9 nextLink: '//td[@align="right" and @width="1%"]/small/b[last()]/a', 10 10 paragraph: '//tr[@bgcolor="#eeeeee"]/td[@align="left"]', 11 mapping : { 11 mapping : { 12 12 link : './/a/@href', 13 13 text : './/a/text()'
![(please configure the [header_logo] section in trac.ini)](/share/chrome/site/your_project_logo.png)