| 1 | using System;
|
|---|
| 2 | using System.Collections.Generic;
|
|---|
| 3 | using System.Text;
|
|---|
| 4 | using System.IO;
|
|---|
| 5 | using System.Xml;
|
|---|
| 6 | using System.Net;
|
|---|
| 7 |
|
|---|
| 8 | using Sgml;
|
|---|
| 9 | using Misuzilla.Web.CSSSelectorToXPath;
|
|---|
| 10 |
|
|---|
| 11 | namespace Misuzilla.Text.Scraper
|
|---|
| 12 | {
|
|---|
/// <summary>
/// Applies a list of XPath extraction commands to an HTML document and returns
/// the extracted values keyed by store name.
/// </summary>
public class Scraper
{
    /// <summary>
    /// Creates a new, empty <see cref="ScrapeDefine"/> to build a scrape definition with.
    /// </summary>
    public static ScrapeDefine Define()
    {
        return new ScrapeDefine();
    }

    private Dictionary<String, Object> _values;
    private String[] _keys;
    private ScrapeDefine _scrapeDef;
    private ScrapeDefine.ProcessCommand[] _commands;

    /// <summary>
    /// Creates a scraper from an already-built definition.
    /// </summary>
    /// <param name="scrapeDef">The definition this scraper was created from.</param>
    /// <param name="values">Initial value store. Note that every call to
    /// <see cref="Scrape(TextReader)"/> replaces the store with a fresh dictionary.</param>
    /// <param name="keys">Names of the stored values to include in the result.</param>
    /// <param name="commands">Commands to run, in order, against each document.</param>
    public Scraper(ScrapeDefine scrapeDef, Dictionary<String, Object> values, String[] keys, ScrapeDefine.ProcessCommand[] commands)
    {
        _scrapeDef = scrapeDef;
        _values = values;
        _keys = keys;
        _commands = commands;
    }

    /// <summary>
    /// Downloads the resource at <paramref name="uri"/> as a string and scrapes it.
    /// </summary>
    /// <param name="uri">Address of the document to download.</param>
    /// <returns>One entry per requested key; the value is null when nothing was stored for it.</returns>
    public Dictionary<String, Object> Scrape(Uri uri)
    {
        using (WebClient wClient = new WebClient())
        {
            return Scrape(wClient.DownloadString(uri));
        }
    }

    /// <summary>
    /// Scrapes a document that is already held in memory.
    /// </summary>
    /// <param name="contentBody">Markup of the document.</param>
    /// <returns>One entry per requested key; the value is null when nothing was stored for it.</returns>
    public Dictionary<String, Object> Scrape(String contentBody)
    {
        // The document is fully loaded before Scrape(TextReader) returns,
        // so the reader created here can be released right away.
        using (StringReader contentReader = new StringReader(contentBody))
        {
            return Scrape(contentReader);
        }
    }

    /// <summary>
    /// Loads the document from <paramref name="textReader"/>, runs every command
    /// against it and collects the requested keys.
    /// </summary>
    /// <param name="textReader">Source of the markup. It is owned by the caller and is
    /// not disposed explicitly by this method.</param>
    /// <returns>One entry per requested key; the value is null when nothing was stored for it.</returns>
    public Dictionary<String, Object> Scrape(TextReader textReader)
    {
        // Each run starts from an empty store.
        _values = new Dictionary<String, Object>();

        XmlDocument xDoc = LoadDocument(textReader);

        if (_commands != null)
        {
            foreach (ScrapeDefine.ProcessCommand cmd in _commands)
            {
                ApplyCommand(xDoc, cmd);
            }
        }

        // Project the store onto the requested keys; keys without a stored value map to null.
        Dictionary<String, Object> retValues = new Dictionary<String, Object>();
        if (_keys != null)
        {
            foreach (String key in _keys)
            {
                Object stored;
                _values.TryGetValue(key, out stored);
                retValues[key] = stored;
            }
        }
        return retValues;
    }

    /// <summary>
    /// Reads the markup through an SgmlReader (DocType "HTML", doctype stripped)
    /// into an <see cref="XmlDocument"/>.
    /// </summary>
    private static XmlDocument LoadDocument(TextReader textReader)
    {
        // NOTE(review): the SgmlReader is intentionally not wrapped in 'using' --
        // disposing it could close the caller-owned textReader; confirm against SgmlReader.Close().
        SgmlReader reader = new SgmlReader();
        reader.InputStream = textReader;
        reader.StripDocType = true;
        reader.DocType = "HTML";

        XmlDocument xDoc = new XmlDocument();
        xDoc.Load(reader);
        return xDoc;
    }

    /// <summary>
    /// Evaluates one command's XPath and stores a value for each of its arguments.
    /// Commands that match no node leave the store untouched.
    /// </summary>
    private void ApplyCommand(XmlDocument xDoc, ScrapeDefine.ProcessCommand cmd)
    {
        if (cmd == null || cmd.Args == null)
        {
            return;
        }

        XmlNodeList nodes = xDoc.SelectNodes(cmd.XPath);
        if (nodes == null || nodes.Count == 0)
        {
            return;
        }

        foreach (ScrapeArg arg in cmd.Args)
        {
            if (arg == null)
            {
                continue;
            }

            // A converter receives the whole match list and decides the stored value itself.
            if (arg.ValueConverter != null)
            {
                String converted = arg.ValueConverter(nodes);
                _values[arg.StoreName] = converted;
                continue;
            }

            // Without a converter only an unambiguous (single-node) match is stored.
            if (nodes.Count != 1)
            {
                continue;
            }

            String extracted;
            if (TryReadNodeValue(nodes[0], arg.Value, out extracted))
            {
                _values[arg.StoreName] = extracted;
            }
        }
    }

    /// <summary>
    /// Resolves a value selector against a node: "@name" reads the attribute
    /// <c>name</c>, "TEXT" (case-insensitive) reads the node's inner text.
    /// </summary>
    /// <returns>false when the selector is null/empty, unrecognised, or names an
    /// attribute the node does not have; nothing should be stored in that case.</returns>
    private static bool TryReadNodeValue(XmlNode node, String selector, out String value)
    {
        value = null;
        if (String.IsNullOrEmpty(selector))
        {
            return false;
        }

        if (selector.StartsWith("@", StringComparison.Ordinal))
        {
            // XmlNode.Attributes is null for non-element nodes, and the indexer
            // returns null for a missing attribute; both used to crash here.
            XmlAttributeCollection attributes = node.Attributes;
            XmlAttribute attribute = attributes == null ? null : attributes[selector.Substring(1)];
            if (attribute == null)
            {
                return false;
            }
            value = attribute.Value;
            return true;
        }

        if (String.Equals(selector, "TEXT", StringComparison.OrdinalIgnoreCase))
        {
            value = node.InnerText;
            return true;
        }

        return false;
    }
}
|
|---|
| 94 |
|
|---|
/// <summary>
/// Describes one value to extract from the nodes matched by a command:
/// the name to store it under and how to obtain it.
/// </summary>
public class ScrapeArg
{
    /// <summary>Key under which the extracted value is stored.</summary>
    public String StoreName { get; set; }

    /// <summary>
    /// Value selector used when no converter is set. The scraper in this file
    /// understands "@name" (attribute value) and "TEXT" (inner text).
    /// </summary>
    public String Value { get; set; }

    /// <summary>
    /// Optional callback that turns the matched node list into the stored string.
    /// When set, the scraper uses it instead of <see cref="Value"/>.
    /// </summary>
    public Converter<XmlNodeList, String> ValueConverter { get; set; }

    /// <summary>Creates an argument with every property left unset (null).</summary>
    public ScrapeArg()
    {
    }

    /// <summary>Creates an argument that extracts by value selector.</summary>
    /// <param name="storeName">Key under which the value is stored.</param>
    /// <param name="value">Value selector, e.g. "@href" or "TEXT".</param>
    public ScrapeArg(String storeName, String value)
    {
        this.StoreName = storeName;
        this.Value = value;
    }

    /// <summary>Creates an argument that extracts through a converter callback.</summary>
    /// <param name="storeName">Key under which the value is stored.</param>
    /// <param name="valueConverter">Callback receiving the matched nodes.</param>
    public ScrapeArg(String storeName, Converter<XmlNodeList, String> valueConverter)
    {
        this.StoreName = storeName;
        this.ValueConverter = valueConverter;
    }
}
|
|---|
| 116 |
|
|---|
/// <summary>
/// Fluent builder for a scrape definition: queue commands with
/// <see cref="Process"/>, then call <see cref="Result"/> to obtain a <see cref="Scraper"/>.
/// </summary>
public class ScrapeDefine
{
    private Queue<ProcessCommand> _commands = new Queue<ProcessCommand>();
    private Dictionary<String, Object> _values = new Dictionary<string, object>();
    private String[] _returnKeys;
    private Scraper _scraper;

    /// <summary>Creates an empty definition with no commands queued.</summary>
    public ScrapeDefine()
    {
    }

    /// <summary>
    /// Queues a command that selects nodes and extracts the given arguments from them.
    /// </summary>
    /// <param name="xpathOrSelctor">An expression starting with "/" is used as XPath as-is;
    /// anything else is passed through <c>SelectorToXPathConverter.SelectorToXPath</c> first.</param>
    /// <param name="args">Values to extract from the matched nodes.</param>
    /// <returns>This instance, so calls can be chained.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="xpathOrSelctor"/> is null.</exception>
    public ScrapeDefine Process(String xpathOrSelctor, params ScrapeArg[] args)
    {
        if (xpathOrSelctor == null)
        {
            throw new ArgumentNullException("xpathOrSelctor");
        }

        // Ordinal comparison: this is a syntactic prefix check, not a linguistic one.
        String xpath = xpathOrSelctor.StartsWith("/", StringComparison.Ordinal)
            ? xpathOrSelctor
            : SelectorToXPathConverter.SelectorToXPath(xpathOrSelctor);

        _commands.Enqueue(new ProcessCommand(xpath, args));
        return this;
    }

    /// <summary>
    /// Builds a <see cref="Scraper"/> from a snapshot of the commands queued so far.
    /// </summary>
    /// <param name="keys">Names of the stored values the scraper should return.
    /// A null array is treated as empty.</param>
    /// <returns>A new scraper bound to this definition.</returns>
    public Scraper Result(params String[] keys)
    {
        if (keys == null)
        {
            keys = new String[0];
        }

        _returnKeys = keys;
        // ToArray copies the queue, so commands queued later do not affect the returned scraper.
        return new Scraper(this, _values, keys, _commands.ToArray());
    }

    /// <summary>
    /// One queued command: an XPath expression plus the arguments to extract
    /// from the nodes it matches.
    /// </summary>
    public class ProcessCommand
    {
        /// <summary>XPath expression used to select nodes.</summary>
        public String XPath;

        /// <summary>Arguments to extract from the matched nodes; never null after construction.</summary>
        public ScrapeArg[] Args;

        /// <summary>Creates a command.</summary>
        /// <param name="xpath">XPath expression used to select nodes.</param>
        /// <param name="args">Arguments to extract; null is stored as an empty array.</param>
        public ProcessCommand(String xpath, params ScrapeArg[] args)
        {
            XPath = xpath;
            // An explicit null for the params array would otherwise break enumeration of Args.
            Args = args ?? new ScrapeArg[0];
        }
    }
}
|
|---|
| 161 | }
|
|---|