Show
Ignore:
Timestamp:
04/16/08 22:49:38 (5 years ago)
Author:
drry
Message:

lang/javascript/jsAutoPageScraper/trunk/test/jsAutoPageScraper-test.js
lang/javascript/jsAutoPageScraper/trunk/test/yahoo_auction.js
lang/javascript/jsAutoPageScraper/trunk/js/jsAutoPageScraper.js:

  • trivial fixes and cleanups in comments.
  • et cetera.
Location:
lang/javascript/jsAutoPageScraper/trunk
Files:
3 modified

Legend:

Unmodified
Added
Removed
  • lang/javascript/jsAutoPageScraper/trunk/js/jsAutoPageScraper.js

    r9424 r9590  
    40 40   } : 
    41 41   function(element, evt, func) { 
    42     element.detachEvent('on'+evt, func); 
     42    element.detachEvent('on' + evt, func); 
    43 43   }; 
    44 44 
    45 45 
    4646/** 
    47  * Create hidden(invisible) iframe in order to scrape contents from DOM tree 
    48  * 
    49  * @param {String} url - URL path to scrape 
    50  * @param {Function} callback - Callback function, invoked when iframe document load complete 
     47 * Create a hidden(invisible) iframe in order to scrape contents from a DOM tree. 
     48 * 
     49 * @param {String} url - URL path to scrape. 
     50 * @param {Function} callback - Callback function, invoked when iframe document load completed. 
    5151 * @param {Object} scope - Scope object that callback to be applied. 
    5252 */ 
    5353function createIframe(url, callback, scope) { 
    5454  // ugly W/A to prevent cache... 
    55   if (url==window.location.href) { 
    56     url += url.indexOf('?')>=0 ? '&' : '?'; 
     55  if (url == window.location.href) { 
     56    url += url.indexOf('?') >= 0 ? '&' : '?'; 
    5757  } 
    5858  var iframeEl = windoc.createElement('iframe'); 
     
    7272 
    7373 
    74 // detect base URL from own script URL 
     74// detect base URL from own script URL. 
    7575var baseUrl = (function() { 
    7676  var scripts = windoc.getElementsByTagName('script'); 
    77   for (var i=0,len=scripts.length; i < len; i++) { 
    78     var m = (scripts[i].src || '').match(/^(.*\/)jsAutoPageScraper\.js$/); 
    79     if (m) return m[1]; 
    80   } 
     77  for (var i=scripts.length,m; i-- > 0; ) { 
     78    if (m = /^.*\/(?=jsAutoPageScraper\.js$)/.exec(scripts[i])) { 
     79      return m[0]; 
     80    } 
     81  } 
     82  return ''; 
    8183})(); 
    8284 
    83 // assuming JavaScript-XPath lib is located in following path : 
     85// assuming JavaScript-XPath lib is located in the following path : 
    8486var xpathLibUrl = baseUrl + 'javascript-xpath/javascript-xpath.js'; 
    8587 
    8688 
    8789/** 
    88  * Append JavaScript-XPath(http://coderepos.org/share/wiki/JavaScript-XPath) to HTML Document 
    89  * 
    90  * @param {Document} doc - HTML Document to add XPath lib 
    91  * @param {Function} callback - callback function, invoked when XPath lib loaded 
     90 * Append JavaScript-XPath <http://coderepos.org/share/wiki/JavaScript-XPath> to HTML Document. 
     91 * 
     92 * @param {Document} doc - HTML Document to add XPath lib. 
     93 * @param {Function} callback - callback function, invoked when XPath lib loaded. 
    9294 * @param {Object} scope - Scope object that callback to be applied. 
    9395 */ 
     
    99101    scriptEl.type = 'text/javascript'; 
    100102    scriptEl.src = xpathLibUrl; 
    101     var ctEl = doc.documentElement; 
    102     ctEl.appendChild(scriptEl); 
     103    doc.documentElement.appendChild(scriptEl); 
    103104    pollXPathLibLoadStatus(); 
    104105  } 
     
    117118 * Query items from HTML doc matching to the given XPath expression. 
    118119 * 
    119  * @param {Document} doc - HTML Document 
    120  * @param {String} xpath - XPath expression 
    121  * @param {Node} contextNode - context node to search elements by xpath 
     120 * @param {Document} doc - HTML Document. 
     121 * @param {String} xpath - XPath expression. 
     122 * @param {Node} contextNode - context node to search elements by xpath. 
    122123 * @returns XPath matching nodes 
    123124 * @type Node 
     
    135136 * Query one item from HTML doc matching to the given XPath expression. 
    136137 * 
    137  * @param {Document} doc - HTML Document 
    138  * @param {String} xpath - XPath expression 
    139  * @param {Node} contextNode - context node to search elements by xpath 
     138 * @param {Document} doc - HTML Document. 
     139 * @param {String} xpath - XPath expression. 
     140 * @param {Node} contextNode - context node to search elements by xpath. 
    140141 * @returns XPath matching node 
    141142 * @type Node 
     
    147148 
    148149/** 
    149  * Convert DOM to JavaScript Object 
    150  * 
    151  * @param {Node} elem - HTML Document Node 
    152  * @param {String|Object|Function} mapping - Mapping information 
     150 * Convert DOM to JavaScript Object. 
     151 * 
     152 * @param {Node} elem - HTML Document Node. 
     153 * @param {String|Object|Function} mapping - Mapping information. 
    153154 * @returns JavaScript object 
    154155 * @type Object 
     
    165166      return mapping(elem); 
    166167    case 'object' : 
    167       var o = {} 
     168      var o = {}; 
    168169      for (var k in mapping) { 
    169170        o[k] = convertDOM2Object(elem, mapping[k]); 
     
    182183/** 
    183184 * JSAutoPageScraper 
    184  *  
     185 * 
    185186 * Usage : <pre> 
    186187 var jscr = new JSAutoPageScraper('http://twitter.com/home', { 
     
    191192     author : './/td[contains(@class,"content")]/strong/a/text()', 
    192193     content : function(p) { return (p.textContent || p.innerText).replace(/\s+/g, ' '); } 
    193    }  
     194   } 
    194195 }); 
    195196 
     
    208209 * 
    209210 * @constructor 
    210  * @param {String} url - Page URL to start scraping 
    211  * @param {Object} info - Siteinfo config, includes "nextLink", "paragraph", and "mapping". 
     211 * @param {String} url - Page URL to start scraping. 
     212 * @param {Object} info - Siteinfo config, includes "nextLink", "paragraph" and "mapping". 
    212213 */ 
    213214var JSAutoPageScraper = function(url, info) { 
    214215  this.proxy = new ScrapeProxy(url, info); 
    215216  this.cmdQueue = []; 
    216 } 
     217}; 
    217218 
    218219 
     
    223224 
    224225  /** 
    225    * Scrape paragraphs by given count.  
    226    *  
    227    * @param {Number} count - Number of records to fetch.  
     226   * Scrape paragraphs by given count. 
     227   * 
     228   * @param {Number} count - Number of records to fetch. 
    228229   * @param {Function} callback - Callback function to get scraped records. 
    229230   * @param {Object} scope - Scope object that callback to be applied. 
     
    240241          count, 
    241242          function(rec) { records.push(rec) }, 
    242           function() { callback.call(scope, records); _this.force() }  
     243          function() { callback.call(scope, records); _this.force() } 
    243244        ] 
    244245      } 
     
    251252  /** 
    252253   * Skip and proceed cursor by given count. 
    253    *  
    254    * @param {Number} count - Number of paragraphs to skip.  
     254   * 
     255   * @param {Number} count - Number of paragraphs to skip. 
    255256   * @returns Self instance 
    256257   * @type JSAutoPageScraper 
     
    269270  /** 
    270271   * Refresh queued commands and wait next command by keeping current cursor and resources. 
    271    *  
     272   * 
    272273   * @returns Self instance 
    273274   * @type JSAutoPageScraper 
     
    281282  /** 
    282283   * Refresh queued commands and cleanup all resources pointing to scraping document. 
    283    *  
     284   * 
    284285   * @returns Self instance 
    285286   * @type JSAutoPageScraper 
     
    296297 
    297298  /** 
    298    * Evaluate delayed command 
     299   * Evaluate delayed command. 
    299300   */ 
    300301  force : function() { 
     
    302303    if (cmd) this.proxy[cmd.method].apply(this.proxy, cmd.args); 
    303304  } 
    304 } 
     305}; 
    305306 
    306307 
    307308/** 
    308309 * Proxy object that accesses scraping document. 
    309  *  
     310 * 
    310311 * @constructor 
    311  * @param {String} url - Page URL to start scraping 
    312  * @param {Object} info - Siteinfo config, includes "nextLink", "paragraph", and "mapping". 
    313  */ 
    314 var ScrapeProxy = function(url, info) {  
     312 * @param {String} url - Page URL to start scraping. 
     313 * @param {Object} info - Siteinfo config, includes "nextLink", "paragraph" and "mapping". 
     314 */ 
     315var ScrapeProxy = function(url, info) { 
    315316  this.url = this.currUrl = url; 
    316317  this.siteinfo = info; 
    317318  this.converter = function(el) { 
    318319    return convertDOM2Object(el, info.mapping); 
    319   } 
    320 } 
     320  }; 
     321}; 
    321322 
    322323 
     
    327328 
    328329  /** 
    329    * Fetch records 
    330    *  
     330   * Fetch records. 
     331   * 
    331332   * @param {Number} count - Fetching count. 
    332333   * @param {Function} onScanRecord - Callbacked when scanning fetched record. 
     
    346347 
    347348  /** 
    348    * Clear and reset fetching state. 
     349   * Clear and reset fetching states. 
    349350   */ 
    350351  clear : function() { 
  • lang/javascript/jsAutoPageScraper/trunk/test/jsAutoPageScraper-test.js

    r9412 r9590  
    496 496   var t = typeof o; 
    497 497   if (t=='string') { 
    498     return '"'+o.replace(/"/g, '\\"').replace(/\n/g, '\\n')+'"'; 
     498    return '"'+o.replace(/(?=")/g, '\\').replace(/\r?\n|\r/g, '\\n')+'"'; 
    499 499   } else if (t=='number' || t=='boolean' || o instanceof Date) { 
    500 500     return (o).toString(); 
     
    525 525     ta.value = debuglogs.join('\n'); 
    526 526   } 
    527 } 
     527}; 
    528 528 
    529 529 
  • lang/javascript/jsAutoPageScraper/trunk/test/yahoo_auction.js

    r9413 r9590  
    9 9   nextLink: '//td[@align="right" and @width="1%"]/small/b[last()]/a', 
    10 10   paragraph: '//tr[@bgcolor="#eeeeee"]/td[@align="left"]', 
    11   mapping : {  
     11  mapping : { 
    12 12     link : './/a/@href', 
    13 13     text : './/a/text()'