当前位置:编程学习 > asp >>

用DOM实现文章采集——通过类似jQuery语法的方法采集指定对象的文本

[csharp]
/// <summary> 
/// DOM查询器,用法跟jquery差不多 
/// </summary> 
public class DomQuery 

    /// <summary> 
    /// 获得节点 
    /// </summary> 
    /// <param name="_HtmlDocument"></param> 
    /// <param name="selector"></param> 
    /// <returns></returns> 
    /// <remarks>DOM选择器,用法跟jquery差不多</remarks> 
    public IList<HtmlNode> Get(HtmlDocument _HtmlDocument, string selector) 
    { 
        string[] Expressions = selector.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries); 
 
        List<HtmlNode> hnList = new List<HtmlNode>(); 
 
        if (Expressions[0].StartsWith("#")) 
        { 
            hnList.Add(_HtmlDocument.GetElementbyId(Expressions[0].TrimStart('#'))); 
            hnList.RemoveAll(x => { return x == null; }); 
 
            if (Expressions.Length == 1) 
            { 
                return hnList; 
            } 
 
            for (int i = 1; i < Expressions.Length; i++) 
            { 
                hnList = Get(hnList, Expressions[i]); 
            } 
        } 
        else 
        { 
            hnList.AddRange(_HtmlDocument.DocumentNode.ChildNodes.Where(x => { return x.NodeType == HtmlNodeType.Element; })); 
 
            for (int i = 0; i < Expressions.Length; i++) 
            { 
                hnList = Get(hnList, Expressions[i]); 
            } 
        } 
 
 
 
 
 
        return hnList; 
    } 
    /// <summary> 
    /// 查找节点,并直接返回InnerHtml 
    /// </summary> 
    /// <param name="_HtmlDocument"></param> 
    /// <param name="selector"></param> 
    /// <returns></returns> 
    public string SingleGetInnerHtml(HtmlDocument _HtmlDocument, string selector) 
    { 
        HtmlNode hn = SingleGet(_HtmlDocument, selector); 
        if (hn == null) 
            return null; 
        else 
            return hn.InnerHtml; 
    } 
    /// <summary> 
    /// 查找节点,并直接返回InnerText 
    /// </summary> 
    /// <param name="_HtmlDocument"></param> 
    /// <param name="selector"></param> 
    /// <returns></returns> 
    public string SingleGetInnerText(HtmlDocument _HtmlDocument, string selector) 
    { 
        HtmlNode hn = SingleGet(_HtmlDocument, selector); 
        if (hn == null) 
            return null; 
        else 
            return hn.InnerText.Trim(); 
    } 
    /// <summary> 
    /// 查找节点 
    /// </summary> 
    /// <param name="_HtmlDocument"></param> 
    /// <param name="selector"></param> 
    /// <returns></returns> 
    public HtmlNode SingleGet(HtmlDocument _HtmlDocument, string selector) 
    { 
        IList<HtmlNode> hnList = Get(_HtmlDocument, selector); 
 
        if (hnList.Count == 0) 
        { 
            return null; 
        } 
        else 
        { 
            return hnList[0]; 
        } 
    } 
 
    #region 获得属性 
    /// <summary> 
    /// 获得属性 
    /// </summary> 
    /// <param name="_HtmlNodes"></param> 
    /// <param name="attr"></param> 
    /// <returns></returns> 
    public string[] Attr(IList<HtmlNode> _HtmlNodes, string attr) 
    { 
        if (_HtmlNodes == null) 
        { 
            return new string[0]; 
        } 
        if (_HtmlNodes.Count() == 0) 
        { 
            return new string[0]; 
        } 
        var v = from x in _HtmlNodes where x.At

补充:Web开发 , ASP.Net ,
CopyRight © 2012 站长网 编程知识问答 www.zzzyk.com All Rights Reserved
部分技术文章来自网络,