|
最近尝试做一个爬虫软件,根据关键字进行抓取页面。其他方面还好,查了不少资料都还弄得比较像样。但是我对多线程这块实在是拙计... 看了书也还是调试不出来,很是着急啊... 下面我给出一个我用来下载页面的类的原型,希望有热心的前辈能根据这个类写一个多线程下载的示例,能注释一下就更好了。感激不尽!
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Threading;
namespace 多线程爬虫_test01
{
/// <summary>
/// Single-threaded crawler that starts from one seed URL, downloads each
/// queued page and queues every not-yet-seen link that stays on the seed's
/// domain.
/// </summary>
/// <remarks>
/// All crawl state lives in static fields that are plain, unsynchronized
/// collections. Constructing a second <see cref="Crawler"/> overwrites
/// <see cref="baseUri"/> and <see cref="baseHost"/>, and nothing here locks
/// the collections, so concurrent use requires external synchronization.
/// </remarks>
public class Crawler
{
    // Matches <a ... href=URL ...>TEXT</a>. Group "url" is the href value and
    // group "text" is the anchor text (which may not contain a nested <a>).
    // Built once instead of on every RefineUrl call.
    private static readonly Regex AnchorRegex = new Regex(
        @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

    /// <summary>
    /// Base address; relative links are resolved against it.
    /// </summary>
    public static Uri baseUri;

    /// <summary>
    /// Domain suffix (with a leading dot) that a link's host must end with
    /// to be considered internal, e.g. ".example.com".
    /// </summary>
    public static string baseHost = string.Empty;

    /// <summary>
    /// Work queue of URLs that still have to be downloaded.
    /// </summary>
    public static Queue<string> todo = new Queue<string>();

    /// <summary>
    /// URLs that have already been taken from the work queue.
    /// </summary>
    public static HashSet<string> visited = new HashSet<string>();

    /// <summary>
    /// Sets the crawl's base address and domain and queues the seed URL.
    /// </summary>
    /// <param name="url">Absolute URL at which the crawl starts.</param>
    /// <exception cref="UriFormatException">
    /// <paramref name="url"/> is not a valid absolute URI.
    /// </exception>
    public Crawler(string url)
    {
        baseUri = new Uri(url);
        baseHost = GetBaseHost(baseUri);
        // The seed is the first entry in the work queue.
        todo.Enqueue(url);
    }

    /// <summary>
    /// Processes the work queue until it is empty: each URL is marked as
    /// visited, downloaded, and its links are handed to
    /// <see cref="RefineUrl"/>. A failed download is reported on the console
    /// and the crawl continues with the next URL.
    /// </summary>
    public void DownLoad()
    {
        while (todo.Count > 0)
        {
            string currentUrl = todo.Dequeue();
            // Marked before the download so a failing page is not retried.
            visited.Add(currentUrl);

            try
            {
                var request = WebRequest.Create(currentUrl) as HttpWebRequest;
                if (request == null)
                {
                    // WebRequest.Create returned a non-HTTP request type
                    // (e.g. file:), which this crawler does not handle.
                    continue;
                }

                string html;
                // Dispose response, stream and reader so the underlying
                // connection is released after every page.
                using (var response = (HttpWebResponse)request.GetResponse())
                using (var stream = response.GetResponseStream())
                using (var reader = new StreamReader(stream))
                {
                    html = reader.ReadToEnd();
                }

                // Extract the links and queue the ones not seen yet.
                RefineUrl(html);
            }
            catch (WebException e)
            {
                System.Console.WriteLine("下载失败,错误:" + e);
            }
            catch (IOException e)
            {
                System.Console.WriteLine("下载失败,错误:" + e);
            }
            catch (NotSupportedException e)
            {
                // Scheme with no registered WebRequest handler (e.g. mailto:).
                System.Console.WriteLine("下载失败,错误:" + e);
            }
            catch (UriFormatException e)
            {
                // Malformed URL placed into the public queue by other code.
                System.Console.WriteLine("下载失败,错误:" + e);
            }
        }
    }

    /// <summary>
    /// Extracts the anchor links from a page and appends every internal
    /// http/https link that is neither visited nor already queued to
    /// <see cref="todo"/>.
    /// </summary>
    /// <param name="html">
    /// HTML source of the downloaded page; null or empty input is ignored.
    /// </param>
    public void RefineUrl(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return;
        }

        foreach (Match match in AnchorRegex.Matches(html))
        {
            string href = match.Groups["url"].Value;
            if (href == "#")
            {
                continue;
            }

            // Resolve relative paths against the base address. TryCreate is
            // used so that one malformed href does not abort the crawl.
            Uri uri;
            if (!Uri.TryCreate(baseUri, href, out uri))
            {
                continue;
            }

            // Only http/https can be downloaded by DownLoad; this drops
            // mailto:, javascript:, ftp: and similar links.
            if (uri.Scheme != Uri.UriSchemeHttp && uri.Scheme != Uri.UriSchemeHttps)
            {
                continue;
            }

            // Drop links that leave the base domain.
            if (!IsInternalHost(uri.Host))
            {
                continue;
            }

            // Kept from the original filter: skip hosts named ftp*.
            if (uri.Host.StartsWith("ftp", StringComparison.Ordinal))
            {
                continue;
            }

            string absoluteUrl = uri.ToString();
            if (!visited.Contains(absoluteUrl) && !todo.Contains(absoluteUrl))
            {
                todo.Enqueue(absoluteUrl);
            }
        }
    }

    // Derives the internal-domain suffix from the seed address. For hosts with
    // three or more labels the first label is dropped ("www.example.com" ->
    // ".example.com"). Hosts with fewer labels and IP addresses are used whole,
    // so "example.com" does not collapse to ".com" and "localhost" does not
    // throw. Multi-part public suffixes such as ".co.uk" are not recognized.
    private static string GetBaseHost(Uri uri)
    {
        string host = uri.Host;
        bool isDnsName = uri.HostNameType == UriHostNameType.Dns;
        if (!isDnsName || host.Split('.').Length < 3)
        {
            return "." + host;
        }

        return host.Substring(host.IndexOf('.'));
    }

    // True when host is the base domain itself or one of its sub-domains.
    // The dot prefix lets the bare domain match the dotted suffix and keeps
    // "notexample.com" from matching ".example.com".
    private static bool IsInternalHost(string host)
    {
        return ("." + host).EndsWith(baseHost, StringComparison.OrdinalIgnoreCase);
    }
}
} |
|