|
/// <summary>
/// Downloads the page at the given address and returns the root node of the parsed HTML document.
/// </summary>
/// <param name="url">Absolute address of the page to download.</param>
/// <returns>
/// The document node of the parsed page, or <c>null</c> when anything fails
/// (malformed address, request error, unreadable response).
/// </returns>
public static HtmlAgilityPack.HtmlNode GetHtmlNodeFromLink(string url)
{
    try
    {
        Uri uri = new Uri(url);
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);

        string str;
        // Dispose the response, its stream and the reader deterministically so the
        // underlying connection is released even if reading fails part-way.
        using (WebResponse response = request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        // The response body is always decoded as gb2312, regardless of what the server declares.
        using (StreamReader read = new StreamReader(stream, Encoding.GetEncoding("gb2312")))
        {
            str = read.ReadToEnd();
        }

        HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
        html.LoadHtml(str);
        return html.DocumentNode;
    }
    catch
    {
        // Deliberate best-effort: callers receive null instead of an exception.
        return null;
    }
}
/// <summary>
/// Collects the link, image and title of every node selected by an XPath expression.
/// </summary>
/// <param name="htmlNode">Node the XPath expression is evaluated against.</param>
/// <param name="xPath">XPath expression selecting the gallery nodes.</param>
/// <param name="imgs">
/// Receives one image address per node: the URL inside <c>background-image:url(...)</c> of the
/// <c>style</c> attribute when that attribute exists, otherwise the value of <c>src2</c>.
/// </param>
/// <param name="links">Receives the <c>href</c> value of each node.</param>
/// <param name="title">
/// Receives the <c>title</c> value of each node, or the <c>alt</c> value when <c>title</c>
/// is missing or empty.
/// </param>
/// <returns>
/// <c>true</c> when at least one node matched and every node was processed; <c>false</c> when
/// nothing matched, a required attribute is missing, or any error occurs. On a <c>false</c>
/// result the arrays may already have been replaced and be only partially filled.
/// </returns>
public static bool GetGalleryInfo(HtmlAgilityPack.HtmlNode htmlNode,string xPath,ref string[] imgs, ref string[] links,ref string[] title)
{
    try
    {
        if (htmlNode == null)
        {
            return false;
        }

        // SelectNodes yields null (not an empty collection) when nothing matches.
        HtmlNodeCollection hnc = htmlNode.SelectNodes(xPath);//"//div[@class='slideBannerA homeSlideAD1']"
        if (hnc == null || hnc.Count < 1)
        {
            return false;
        }

        links = new string[hnc.Count];
        title = new string[hnc.Count];
        imgs = new string[hnc.Count];

        // Extracts the address from an inline style such as "background-image:url(...)".
        string cateDataRegex = @"background-image:url\((?<image>.+)\)";
        Regex re = new Regex(cateDataRegex, RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace);

        int i = 0;
        foreach (HtmlNode node in hnc)
        {
            HtmlAttributeCollection hac = node.Attributes;

            // The link is mandatory: a node without href fails the whole call.
            HtmlAgilityPack.HtmlAttribute hrefAttribute = hac["href"];
            if (hrefAttribute == null)
            {
                return false;
            }
            links[i] = hrefAttribute.Value;

            // Prefer the background image from the inline style; fall back to src2.
            HtmlAgilityPack.HtmlAttribute styleAttribute = hac["style"];
            if (styleAttribute != null)
            {
                imgs[i] = re.Match(styleAttribute.Value).Groups["image"].Value;
            }
            else
            {
                HtmlAgilityPack.HtmlAttribute src2Attribute = hac["src2"];
                if (src2Attribute == null)
                {
                    return false;
                }
                imgs[i] = src2Attribute.Value;
            }

            // A missing title attribute must fall back to alt exactly like an empty one;
            // dereferencing it unconditionally used to abort the whole call.
            HtmlAgilityPack.HtmlAttribute titleAttribute = hac["title"];
            if (titleAttribute != null && !string.IsNullOrEmpty(titleAttribute.Value))
            {
                title[i] = titleAttribute.Value;
            }
            else
            {
                HtmlAgilityPack.HtmlAttribute altAttribute = hac["alt"];
                if (altAttribute == null)
                {
                    return false;
                }
                title[i] = altAttribute.Value;
            }

            i++;
        }
        return true;
    }
    catch
    {
        // Best-effort contract: any failure (e.g. an invalid XPath expression) is reported as false.
        return false;
    }
}
// Usage example
// GetGalleryInfo declares its array parameters as ref, so the arguments must be
// definitely assigned beforehand and passed with ref (passing them with out does not compile).
string[] strLink = null;
string[] strLinAlt = null;
string[] strImg = null;
string urls = "http://www.newegg.com.cn";
HtmlAgilityPack.HtmlNode nodes = GetHtmlNodeFromLink(urls);
GetGalleryInfo(nodes, "//div[@class='slideBannerA homeSlideAD1']/div[1]/div[1]/a", ref strImg, ref strLink, ref strLinAlt);
|
|