// Web-page residue, not code: "20 金钱" ("20 coins")
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
namespace code20
{
/// <summary>
/// Small console tool with two modes:
/// mode "1" downloads the article list page, follows every link whose href
/// contains "article", and stores the title and content block of each page
/// as a numbered text file in the "data" folder below the current directory;
/// any other input post-processes the files already stored in that folder.
/// </summary>
class Program
{
    // Root that hrefs found on the list page are resolved against.
    private static readonly Uri SiteRoot = new Uri("http://www.admin5.com/");

    // List page whose links are crawled in mode "1".
    private const string ListUrl = "http://www.admin5.com/browse/177/";

    // Name of the folder (below the current directory) holding the saved pages.
    private const string DataFolderName = "data";

    // The patterns are loop-invariant, so they are built once instead of per page.
    private static readonly Regex HrefRegex = new Regex("(?<=href=\").*?(?=\")");
    private static readonly Regex TitleRegex = new Regex("(?<=<title>).*?(?=</title>)");
    private static readonly Regex ContentRegex = new Regex("<div class=\"content\">[\\s\\S]*?</div>");

    /// <summary>
    /// Entry point: asks for the mode, runs it, then waits for Enter before exiting.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        Console.Write("1:抓取,2:处理:");
        string choice = Console.ReadLine();
        string dataDir = Path.Combine(Directory.GetCurrentDirectory(), DataFolderName);

        if (choice == "1")
        {
            CrawlArticles(dataDir);
        }
        else
        {
            ProcessSavedFiles(dataDir);
        }

        // Keep the console window open until the user confirms.
        Console.ReadLine();
    }

    /// <summary>
    /// Downloads the list page and saves every linked article into <paramref name="dataDir"/>.
    /// A link that cannot be resolved, downloaded or parsed is reported and skipped,
    /// so one bad page no longer aborts the whole crawl.
    /// </summary>
    /// <param name="dataDir">Folder the article files are written to; created if missing.</param>
    private static void CrawlArticles(string dataDir)
    {
        // NOTE(review): Encoding.Default is platform dependent; confirm it matches
        // the charset the site actually serves.
        string listHtml = gethtml(ListUrl, Encoding.Default);
        MatchCollection links = HrefRegex.Matches(listHtml);

        // CreateDirectory is a no-op when the folder already exists.
        Directory.CreateDirectory(dataDir);

        for (int i = 0; i < links.Count; i++)
        {
            string href = links[i].Value;
            if (!href.Contains("article"))
            {
                continue;
            }

            string articleUrl = ResolveUrl(href);
            if (articleUrl == null)
            {
                Console.WriteLine("链接无效,已跳过:" + href);
                continue;
            }

            Console.WriteLine(articleUrl);
            Console.WriteLine("抓取内容....");

            string pageHtml;
            try
            {
                pageHtml = gethtml(articleUrl, Encoding.Default);
            }
            catch (WebException ex)
            {
                Console.WriteLine("抓取失败:" + ex.Message);
                continue;
            }

            string title;
            string content;
            if (!TryExtractArticle(pageHtml, out title, out content))
            {
                Console.WriteLine("未找到标题或内容,已跳过");
                continue;
            }

            Console.WriteLine("保存数据...");
            // The file name is the index of the link on the list page, as before.
            File.WriteAllText(Path.Combine(dataDir, i + ".txt"), title + "\r\n" + content);
            Console.WriteLine("保存成功!");
        }
    }

    /// <summary>
    /// Rewrites every file in <paramref name="dataDir"/>: shortens the title line,
    /// points admin5 links at shouyu and strips the content wrapper div.
    /// Waits for Enter after each file, as the original program did.
    /// </summary>
    /// <param name="dataDir">Folder that holds the previously saved article files.</param>
    private static void ProcessSavedFiles(string dataDir)
    {
        if (!Directory.Exists(dataDir))
        {
            // Directory.GetFiles would throw here; report the problem instead.
            Console.WriteLine("数据目录不存在:" + dataDir);
            return;
        }

        foreach (string filename in Directory.GetFiles(dataDir))
        {
            Console.WriteLine(filename);
            string saved = File.ReadAllText(filename, Encoding.UTF8);

            string title;
            string content;
            SplitSavedPage(saved, out title, out content);

            // NOTE(review): re-running this mode on an already processed file
            // trims a further '-' segment if the remaining title contains one.
            title = TrimSiteSuffix(title);
            Console.Write(title);

            content = RewriteContent(content);

            Console.WriteLine("保存...");
            // WriteAllText overwrites in place; deleting first only risked losing
            // the file when the write failed.
            File.WriteAllText(filename, title + "\r\n" + content, Encoding.UTF8);
            Console.ReadLine();
        }
    }

    /// <summary>
    /// Resolves an href from the list page against the site root.
    /// Handles relative ("article/x"), root-relative ("/article/x") and absolute links.
    /// </summary>
    /// <param name="href">Raw value of an href attribute.</param>
    /// <returns>The absolute http/https URL, or null when the href cannot be used.</returns>
    internal static string ResolveUrl(string href)
    {
        if (string.IsNullOrEmpty(href))
        {
            return null;
        }

        Uri resolved;
        if (!Uri.TryCreate(SiteRoot, href, out resolved))
        {
            return null;
        }

        // Only http(s) targets can be downloaded; skip javascript:, mailto: and the like.
        if (resolved.Scheme != Uri.UriSchemeHttp && resolved.Scheme != Uri.UriSchemeHttps)
        {
            return null;
        }

        return resolved.AbsoluteUri;
    }

    /// <summary>
    /// Pulls the text of the first title element and the first content div out of a page.
    /// </summary>
    /// <param name="pageHtml">HTML of the article page.</param>
    /// <param name="title">Text between the title tags, or null on failure.</param>
    /// <param name="content">The content div including its tags, or null on failure.</param>
    /// <returns>True when both parts were found; false otherwise.</returns>
    internal static bool TryExtractArticle(string pageHtml, out string title, out string content)
    {
        title = null;
        content = null;
        if (pageHtml == null)
        {
            return false;
        }

        Match titleMatch = TitleRegex.Match(pageHtml);
        Match contentMatch = ContentRegex.Match(pageHtml);
        if (!titleMatch.Success || !contentMatch.Success)
        {
            return false;
        }

        title = titleMatch.Value;
        content = contentMatch.Value;
        return true;
    }

    /// <summary>
    /// Splits a saved file into its first line (the title) and everything after it.
    /// Only the first line break is used, so a title that reappears in the body is kept there.
    /// </summary>
    /// <param name="saved">Full text of a saved article file.</param>
    /// <param name="title">First line without its line terminator.</param>
    /// <param name="content">Text after the first line break; empty when there is none.</param>
    internal static void SplitSavedPage(string saved, out string title, out string content)
    {
        int lineEnd = saved.IndexOf('\n');
        if (lineEnd < 0)
        {
            title = saved;
            content = string.Empty;
        }
        else
        {
            title = saved.Substring(0, lineEnd);
            content = saved.Substring(lineEnd + 1);
        }

        // Files are written with "\r\n", so drop the carriage return left on the title.
        title = title.TrimEnd('\r');
    }

    /// <summary>
    /// Cuts the title at its last '-' (dropping the dash and everything after it).
    /// A title without any '-' is returned unchanged.
    /// </summary>
    /// <param name="title">Title line of a saved article.</param>
    /// <returns>The shortened title.</returns>
    internal static string TrimSiteSuffix(string title)
    {
        int lastDash = title.LastIndexOf('-');
        return lastDash < 0 ? title : title.Remove(lastDash);
    }

    /// <summary>
    /// Replaces the host www.admin5.com with www.shouyu.com in every URL that occurs
    /// as an href value, then removes the content wrapper tags.
    /// </summary>
    /// <param name="content">Saved content block of an article.</param>
    /// <returns>The rewritten content.</returns>
    internal static string RewriteContent(string content)
    {
        foreach (Match link in HrefRegex.Matches(content))
        {
            string url = link.Value;
            string newUrl = url.Replace("www.admin5.com", "www.shouyu.com");

            // Skipping unchanged values also covers href="", for which
            // string.Replace would throw because the search text is empty.
            if (newUrl != url)
            {
                content = content.Replace(url, newUrl);
            }
        }

        return content.Replace("<div class=\"content\">", "").Replace("</div>", "");
    }

    /// <summary>
    /// Downloads the document at the given URL and decodes it with the given encoding.
    /// </summary>
    /// <param name="url">Complete URL including the http scheme.</param>
    /// <param name="enc">Encoding used to decode the response bytes.</param>
    /// <returns>The response body as a string (the HTML source).</returns>
    public static string gethtml(string url, Encoding enc)
    {
        // Dispose the client, the response stream and the reader even when reading fails.
        using (WebClient client = new WebClient())
        using (Stream stream = client.OpenRead(url))
        using (StreamReader reader = new StreamReader(stream, enc))
        {
            return reader.ReadToEnd();
        }
    }
}
}
// Web-page residue, not code: "我来回答" ("I'll answer")