- 积分
- 40165
- 好友
- 记录
- 主题
- 帖子
- 听众
- 收听
|
C# HTMLHelper类对Html源码处理教程与源码下载
导读部分
-------------------------------------------------------------------------------------------------------------
C#基类|C#自定义类|C#帮助类--系列导航文章
http://www.sufeinet.com/thread-655-1-1.html
源码下载,请到基库中直接查找http://www.sufeinet.com/thread-655-1-1.html
主要功能有:
1.获取HTML源代码
2.获取Html源代码的字符流
3.清理所有的Html标记,标签
4.匹配页面的链接,获取Html中所有的A链接
5.匹配页面的图片地址 ,获取Html中所有的图片地址
6.抓取远程页面内容
7.压缩HTML输出
8.过滤指定HTML标签
9.加载文件块
10.加载CSS样式文件
11.加载JavaScript脚本文件
如下图片
QQ截图20140121085054.jpg (101.69 KB, 下载次数: 0)
下载附件
昨天 08:50 上传
预览源码
代码如下
/// <summary>
/// Class: HTMLHelper
/// Author: 苏飞 (Su Fei)
/// Contact: 361983679
/// Updates: http://www.sufeinet.com/thread-655-1-1.html
/// </summary>
using System;
using System.Text;
using System.Net;
using System.IO;
using System.Threading;
using System.Text.RegularExpressions;
namespace DotNet.Utilities
{
public class HTMLHelper
{
#region 私有字段
// Cookie container returned by the CookieContainer property.
private static CookieContainer cc = new CookieContainer();
// Content-Type header value assigned to requests built by GetHtml and GetStream.
private static string contentType = "application/x-www-form-urlencoded";
// Accept header value assigned to requests built by GetHtml and GetStream.
private static string accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/x-silverlight, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-ms-application, application/x-ms-xbap, application/vnd.ms-xpsdocument, application/xaml+xml, application/x-silverlight-2-b1, */*";
// User-Agent header value assigned to requests built by GetHtml and GetStream.
private static string userAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";
// Encoding passed to the StreamReader that decodes GetHtml responses; exposed through the Encoding property.
private static Encoding encoding = Encoding.GetEncoding("utf-8");
// Base value behind NetworkDelay, which returns a random number between delay and delay * 2 (passed to Thread.Sleep, i.e. milliseconds).
private static int delay = 1000;
// Retry ceiling exposed through MaxTry; the same value is also assigned to ServicePoint.ConnectionLimit on each request.
private static int maxTry = 300;
// Attempt counter for the fetch methods' retry logic; static and unsynchronized, so it is shared by all callers.
private static int currentTry = 0;
#endregion
#region 公有属性
/// <summary>
/// Gets the cookie container held in the static <c>cc</c> field.
/// </summary>
public static CookieContainer CookieContainer
{
    get { return cc; }
}
/// <summary>
/// Gets or sets the text encoding stored in the static <c>encoding</c> field.
/// </summary>
public static Encoding Encoding
{
    get { return encoding; }
    set { encoding = value; }
}
// One generator shared by every NetworkDelay read. Creating a Random per read
// seeds it from the clock, so reads made at nearly the same moment could all
// return the same "random" delay. System.Random is not thread-safe, hence the lock.
private static readonly Random delayJitter = new Random();
private static readonly object delayJitterLock = new object();
/// <summary>
/// Gets a randomized value in the range [delay, delay * 2); setting the
/// property replaces the base <c>delay</c> value.
/// </summary>
public static int NetworkDelay
{
    get
    {
        int baseDelay = delay;
        lock (delayJitterLock)
        {
            return delayJitter.Next(baseDelay, baseDelay * 2);
        }
    }
    set
    {
        delay = value;
    }
}
/// <summary>
/// Gets or sets the value of the static <c>maxTry</c> field (the retry ceiling).
/// </summary>
public static int MaxTry
{
    get { return maxTry; }
    set { maxTry = value; }
}
#endregion
#region 获取HTML
/// <summary>
/// POSTs <paramref name="postData"/> to <paramref name="url"/> and returns the response body as text.
/// </summary>
/// <remarks>
/// A failed attempt is retried up to <c>maxTry</c> more times, pausing for
/// <c>NetworkDelay</c> before every attempt, and the body of the first
/// successful attempt is returned. The attempt count is local to the call, so
/// concurrent callers do not disturb each other's retry budget.
/// </remarks>
/// <param name="url">Address to request; also sent as the Referer header.</param>
/// <param name="postData">Request body. When null or empty the call is delegated to the GET overload.</param>
/// <param name="isPost">
/// True to send the body with POST. When false the call is delegated to the GET
/// overload and the body is not sent, because a GET request cannot carry one.
/// </param>
/// <param name="cookieContainer">Cookie container attached to the request.</param>
/// <returns>The decoded response body, or an empty string when every attempt failed.</returns>
public static string GetHtml(string url, string postData, bool isPost, CookieContainer cookieContainer)
{
    if (!isPost || string.IsNullOrEmpty(postData))
    {
        return GetHtml(url, cookieContainer);
    }

    byte[] byteRequest = Encoding.Default.GetBytes(postData);

    // One initial attempt plus up to maxTry retries.
    for (int attempt = 0; attempt <= maxTry; attempt++)
    {
        Thread.Sleep(NetworkDelay);
        HttpWebRequest httpWebRequest = null;
        try
        {
            httpWebRequest = (HttpWebRequest)WebRequest.Create(url);
            httpWebRequest.CookieContainer = cookieContainer;
            httpWebRequest.ContentType = contentType;
            httpWebRequest.ServicePoint.ConnectionLimit = maxTry;
            httpWebRequest.Referer = url;
            httpWebRequest.Accept = accept;
            httpWebRequest.UserAgent = userAgent;
            httpWebRequest.Method = "POST";
            httpWebRequest.ContentLength = byteRequest.Length;

            using (Stream requestStream = httpWebRequest.GetRequestStream())
            {
                requestStream.Write(byteRequest, 0, byteRequest.Length);
            }

            // using blocks release the connection even when reading fails midway.
            using (HttpWebResponse httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse())
            using (Stream responseStream = httpWebResponse.GetResponseStream())
            using (StreamReader streamReader = new StreamReader(responseStream, encoding))
            {
                return streamReader.ReadToEnd();
            }
        }
        catch (Exception)
        {
            // Best effort: drop this attempt and fall through to the next one.
            if (httpWebRequest != null)
            {
                httpWebRequest.Abort();
            }
        }
    }

    return string.Empty;
}
/// <summary>
/// Requests <paramref name="url"/> with GET and returns the response body as text.
/// </summary>
/// <remarks>
/// A failed attempt is retried up to <c>maxTry</c> more times, pausing for
/// <c>NetworkDelay</c> before every attempt, and the body of the first
/// successful attempt is returned. The attempt count is local to the call, so
/// concurrent callers do not disturb each other's retry budget.
/// </remarks>
/// <param name="url">Address to request; also sent as the Referer header.</param>
/// <param name="cookieContainer">Cookie container attached to the request.</param>
/// <returns>The decoded response body, or an empty string when every attempt failed.</returns>
public static string GetHtml(string url, CookieContainer cookieContainer)
{
    // One initial attempt plus up to maxTry retries.
    for (int attempt = 0; attempt <= maxTry; attempt++)
    {
        Thread.Sleep(NetworkDelay);
        HttpWebRequest httpWebRequest = null;
        try
        {
            httpWebRequest = (HttpWebRequest)WebRequest.Create(url);
            httpWebRequest.CookieContainer = cookieContainer;
            httpWebRequest.ContentType = contentType;
            httpWebRequest.ServicePoint.ConnectionLimit = maxTry;
            httpWebRequest.Referer = url;
            httpWebRequest.Accept = accept;
            httpWebRequest.UserAgent = userAgent;
            httpWebRequest.Method = "GET";

            // using blocks release the connection even when reading fails midway.
            using (HttpWebResponse httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse())
            using (Stream responseStream = httpWebResponse.GetResponseStream())
            using (StreamReader streamReader = new StreamReader(responseStream, encoding))
            {
                return streamReader.ReadToEnd();
            }
        }
        catch (Exception)
        {
            // Best effort: drop this attempt and fall through to the next one.
            if (httpWebRequest != null)
            {
                httpWebRequest.Abort();
            }
        }
    }

    return string.Empty;
}
#endregion
#region 获取字符流
//---------------------------------------------------------------------------------------------------------------
// Example:
// System.Net.CookieContainer cookie = new System.Net.CookieContainer();
// Stream s = HTMLHelper.GetStream("http://ptlogin2.qq.com/getimage?aid=15000102&0.43878429697395826", cookie);
// picVerify.Image = Image.FromStream(s);
//---------------------------------------------------------------------------------------------------------------
/// <summary>
/// Requests <paramref name="url"/> with GET and returns the raw response stream.
/// </summary>
/// <remarks>
/// A failed attempt is retried up to <c>maxTry</c> more times, pausing for
/// <c>NetworkDelay</c> before each retry. The returned stream is left open for
/// the caller to read and close.
/// </remarks>
/// <param name="url">Address to request; also sent as the Referer header.</param>
/// <param name="cookieContainer">Cookie container attached to the request.</param>
/// <returns>The response stream, or null when every attempt failed.</returns>
public static Stream GetStream(string url, CookieContainer cookieContainer)
{
    // One initial attempt plus up to maxTry retries.
    for (int attempt = 0; attempt <= maxTry; attempt++)
    {
        if (attempt > 0)
        {
            // The first attempt is immediate; only retries are spaced out.
            Thread.Sleep(NetworkDelay);
        }

        HttpWebRequest httpWebRequest = null;
        HttpWebResponse httpWebResponse = null;
        try
        {
            httpWebRequest = (HttpWebRequest)WebRequest.Create(url);
            httpWebRequest.CookieContainer = cookieContainer;
            httpWebRequest.ContentType = contentType;
            httpWebRequest.ServicePoint.ConnectionLimit = maxTry;
            httpWebRequest.Referer = url;
            httpWebRequest.Accept = accept;
            httpWebRequest.UserAgent = userAgent;
            httpWebRequest.Method = "GET";
            httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse();
            // The response is deliberately not closed here: the caller still has to read the stream.
            return httpWebResponse.GetResponseStream();
        }
        catch (Exception)
        {
            // Best effort: release what this attempt acquired and try again.
            if (httpWebRequest != null)
            {
                httpWebRequest.Abort();
            }
            if (httpWebResponse != null)
            {
                httpWebResponse.Close();
            }
        }
    }

    return null;
}
#endregion
#region 清除HTML标记
/// <summary>
/// Removes script blocks and HTML tags from markup and decodes a small set of
/// character entities, returning the remaining text.
/// </summary>
/// <param name="Htmlstring">Markup to clean. Null or empty input yields an empty string.</param>
/// <returns>
/// The text with tags removed, the supported entities decoded, other numeric
/// character references dropped, and any remaining angle brackets and CRLF
/// pairs stripped.
/// </returns>
public static string NoHTML(string Htmlstring)
{
    if (string.IsNullOrEmpty(Htmlstring))
    {
        return string.Empty;
    }

    // Singleline lets ".*?" span line breaks, so a multi-line script body is
    // removed with its tags instead of being left behind as text.
    Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
    // Remove tags.
    Htmlstring = Regex.Replace(Htmlstring, @"<.+?>", "", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
    // Collapse a line break together with the whitespace that follows it.
    Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
    // Remove comment remnants.
    Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);
    // Decode every entity in one pass so that decoded output is never decoded
    // a second time ("&amp;lt;" stays the literal text "&lt;").
    Htmlstring = Regex.Replace(Htmlstring, @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);", new MatchEvaluator(DecodeHtmlEntity), RegexOptions.IgnoreCase);
    // String.Replace returns a new string; the result has to be assigned back.
    Htmlstring = Htmlstring.Replace("<", "");
    Htmlstring = Htmlstring.Replace(">", "");
    Htmlstring = Htmlstring.Replace("\r\n", "");
    return Htmlstring;
}

/// <summary>
/// Maps one matched entity to its replacement text; numeric references without
/// a dedicated mapping are removed.
/// </summary>
private static string DecodeHtmlEntity(Match entity)
{
    switch (entity.Groups[1].Value.ToLowerInvariant())
    {
        case "quot":
        case "#34":
            return "\"";
        case "amp":
        case "#38":
            return "&";
        case "lt":
        case "#60":
            return "<";
        case "gt":
        case "#62":
            return ">";
        case "nbsp":
        case "#160":
            return " ";
        case "iexcl":
        case "#161":
            return "\u00A1";
        case "cent":
        case "#162":
            return "\u00A2";
        case "pound":
        case "#163":
            return "\u00A3";
        case "copy":
        case "#169":
            return "\u00A9";
        default:
            return "";
    }
}
#endregion
#region 匹配页面的链接
/// <summary>
/// Collects the values of all href attributes found in the markup.
/// </summary>
/// <param name="HtmlCode">Markup to scan. Null or empty input yields an empty string.</param>
/// <returns>
/// The attribute values in document order, each followed by '|'. Values keep
/// their original casing and carry no surrounding quotes or trailing markup.
/// Empty values are skipped.
/// </returns>
public string GetHref(string HtmlCode)
{
    if (string.IsNullOrEmpty(HtmlCode))
    {
        return "";
    }

    // Accepts double-quoted, single-quoted and unquoted values, with optional
    // whitespace around '='. The look-behind rejects names that merely end in
    // "href", such as data-href.
    string Reg = @"(?<![\w-])href\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s""'>]+))";
    StringBuilder links = new StringBuilder();
    foreach (Match m in Regex.Matches(HtmlCode, Reg, RegexOptions.IgnoreCase))
    {
        string link = m.Groups["url"].Value.Trim();
        if (link.Length > 0)
        {
            links.Append(link).Append('|');
        }
    }
    return links.ToString();
}
#endregion
#region 匹配页面的图片地址
/// <summary>
/// Collects the image addresses of all img tags found in the markup.
/// </summary>
/// <param name="HtmlCode">Markup to scan. Null or empty input yields an empty string.</param>
/// <param name="imgHttp">Prefix (for example "http://host/") prepended to addresses that do not look absolute.</param>
/// <returns>One address per img tag, in document order, each followed by '|'. Original casing is preserved.</returns>
public string GetImgSrc(string HtmlCode, string imgHttp)
{
    if (string.IsNullOrEmpty(HtmlCode))
    {
        return "";
    }

    StringBuilder sources = new StringBuilder();
    // IgnoreCase replaces lower-casing the whole document, which used to
    // corrupt case-sensitive image paths.
    foreach (Match m in Regex.Matches(HtmlCode, @"<img\b[^>]*>", RegexOptions.IgnoreCase))
    {
        sources.Append(GetImg(m.Value, imgHttp)).Append('|');
    }
    return sources.ToString();
}
/// <summary>
/// Extracts the src attribute value from a single img tag.
/// </summary>
/// <param name="ImgString">Text of one img tag.</param>
/// <param name="imgHttp">Prefix prepended when the address does not look absolute.</param>
/// <returns>
/// The src value unchanged when it contains "://", starts with "//", or contains
/// one of the known domain suffixes; otherwise <paramref name="imgHttp"/> followed
/// by the src value. When no src attribute is present the value is empty, so
/// only <paramref name="imgHttp"/> is returned.
/// </returns>
public string GetImg(string ImgString, string imgHttp)
{
    string src = "";
    if (!string.IsNullOrEmpty(ImgString))
    {
        // Capture only the attribute value: quoted (either quote style) or unquoted.
        // The look-behind rejects names that merely end in "src", such as data-src.
        Match m = Regex.Match(ImgString, @"(?<![\w-])src\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s""'>]+))", RegexOptions.IgnoreCase);
        if (m.Success)
        {
            src = m.Groups["url"].Value.Trim();
        }
    }

    bool looksAbsolute = src.IndexOf("://", StringComparison.Ordinal) != -1
        || src.StartsWith("//", StringComparison.Ordinal);
    if (!looksAbsolute)
    {
        // Original heuristic, kept for scheme-less addresses such as "www.site.com/a.jpg".
        string[] domainSuffixes = { ".net", ".com", ".org", ".cn", ".cc", ".info", ".biz", ".tv" };
        foreach (string suffix in domainSuffixes)
        {
            if (src.IndexOf(suffix, StringComparison.OrdinalIgnoreCase) != -1)
            {
                looksAbsolute = true;
                break;
            }
        }
    }

    if (looksAbsolute)
    {
        return src;
    }
    return imgHttp + src;
}
#endregion
#region 抓取远程页面内容
/// <summary>
/// Fetches a remote page with GET and returns its body.
/// </summary>
/// <param name="tUrl">Address to request.</param>
/// <returns>
/// The response body decoded with the system default encoding, each line
/// terminated by CRLF. When the request fails, the exception message is
/// returned instead, so callers cannot tell an error from content by type.
/// </returns>
public static string Get_Http(string tUrl)
{
    string strResult;
    try
    {
        HttpWebRequest hwr = (HttpWebRequest)WebRequest.Create(tUrl);
        hwr.Timeout = 19600;
        using (HttpWebResponse hwrs = (HttpWebResponse)hwr.GetResponse())
        using (Stream myStream = hwrs.GetResponseStream())
        using (StreamReader sr = new StreamReader(myStream, Encoding.Default))
        {
            StringBuilder sb = new StringBuilder();
            string line;
            // Read until ReadLine reports end of stream. Peek() can return -1
            // on a network stream before the body is finished, which would
            // truncate the result.
            while ((line = sr.ReadLine()) != null)
            {
                sb.Append(line).Append("\r\n");
            }
            strResult = sb.ToString();
        }
    }
    catch (Exception ee)
    {
        strResult = ee.Message;
    }
    return strResult;
}
/// <summary>
/// Sends a form-encoded POST request and returns the response body.
/// </summary>
/// <param name="url">Address to post to.</param>
/// <param name="postData">Request body (parameter list).</param>
/// <param name="encodeType">Name of the encoding used to convert <paramref name="postData"/> to bytes.</param>
/// <returns>
/// The response body, or the exception message when the request fails, so
/// callers cannot tell an error from content by type.
/// </returns>
public static string Post_Http(string url, string postData, string encodeType)
{
    string strResult = null;
    try
    {
        Encoding requestEncoding = Encoding.GetEncoding(encodeType);
        byte[] body = requestEncoding.GetBytes(postData);
        HttpWebRequest myRequest = (HttpWebRequest)WebRequest.Create(url);
        myRequest.Method = "POST";
        myRequest.ContentType = "application/x-www-form-urlencoded";
        myRequest.ContentLength = body.Length;
        using (Stream newStream = myRequest.GetRequestStream())
        {
            newStream.Write(body, 0, body.Length);
        }

        // NOTE(review): the response is decoded with the system default encoding,
        // not with encodeType. Kept as-is to preserve behavior; confirm this is intended.
        using (HttpWebResponse myResponse = (HttpWebResponse)myRequest.GetResponse())
        using (StreamReader reader = new StreamReader(myResponse.GetResponseStream(), Encoding.Default))
        {
            strResult = reader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        strResult = ex.Message;
    }
    return strResult;
}
#endregion
#region 压缩HTML输出
/// <summary>
/// Shrinks HTML output: removes whitespace between adjacent tags, removes CRLF
/// line breaks together with the whitespace that follows them, and rewrites
/// the body element's opening and closing tags in lower case.
/// </summary>
/// <param name="Html">Markup to compress.</param>
/// <returns>The compressed markup.</returns>
public static string ZipHtml(string Html)
{
    Regex betweenTags = new Regex(@">\s+?<");
    Regex lineBreaks = new Regex(@"\r\n\s*");
    Regex bodyElement = new Regex(@"<body([\s|\S]*?)>([\s|\S]*?)</body>", RegexOptions.IgnoreCase);

    string compact = betweenTags.Replace(Html, "><");
    compact = lineBreaks.Replace(compact, "");
    return bodyElement.Replace(compact, @"<body$1>$2</body>");
}
#endregion
#region 过滤指定HTML标签
/// <summary>
/// Removes the opening and closing tags of one HTML element from a string,
/// leaving the content between them in place.
/// </summary>
/// <param name="s_TextStr">Text to filter. Null or empty input yields an empty string.</param>
/// <param name="html_Str">
/// Tag name to remove, for example a, img, p or div. The value is inserted into
/// a regular expression unescaped, so an alternation such as "a|img" also works.
/// </param>
/// <returns>The text without the matching tags.</returns>
public static string DelHtml(string s_TextStr, string html_Str)
{
    if (string.IsNullOrEmpty(s_TextStr))
    {
        return "";
    }

    string openTag;
    string closeTag;
    if (string.IsNullOrEmpty(html_Str))
    {
        // No tag name given: keep the historical patterns, which strip every tag.
        openTag = "<[^>]*>";
        closeTag = "</>";
    }
    else
    {
        // The look-ahead requires the tag name to end right there, so "a" no
        // longer removes <abbr> or <article>, and "p" no longer removes <pre>.
        openTag = "<(?:" + html_Str + @")(?=[\s/>])[^>]*>";
        closeTag = "</(?:" + html_Str + @")\s*>";
    }

    string rStr = Regex.Replace(s_TextStr, openTag, "", RegexOptions.IgnoreCase);
    return Regex.Replace(rStr, closeTag, "", RegexOptions.IgnoreCase);
}
#endregion
#region 加载文件块
/// <summary>
/// Resolves a path by passing it to <c>ResolveUrl</c> on the given page.
/// </summary>
/// <param name="Path">Path to resolve.</param>
/// <param name="p">Page whose <c>ResolveUrl</c> performs the resolution.</param>
/// <returns>The value returned by <c>ResolveUrl</c>.</returns>
public static string File(string Path, System.Web.UI.Page p)
{
    string resolved = p.ResolveUrl(Path);
    return resolved;
}
#endregion
#region 加载CSS样式文件
/// <summary>
/// Builds a stylesheet link element for a CSS file.
/// </summary>
/// <param name="cssPath">Path of the CSS file, resolved through <c>ResolveUrl</c> on the page.</param>
/// <param name="p">Page whose <c>ResolveUrl</c> performs the resolution.</param>
/// <returns>The link element markup followed by CRLF.</returns>
public static string CSS(string cssPath, System.Web.UI.Page p)
{
    string href = p.ResolveUrl(cssPath);
    return string.Concat(@"<link href=""", href, @""" rel=""stylesheet"" type=""text/css"" />", "\r\n");
}
#endregion
#region 加载JavaScript脚本文件
/// <summary>
/// Builds a script element for a JavaScript file.
/// </summary>
/// <param name="jsPath">Path of the script file, resolved through <c>ResolveUrl</c> on the page.</param>
/// <param name="p">Page whose <c>ResolveUrl</c> performs the resolution.</param>
/// <returns>The script element markup followed by CRLF.</returns>
public static string JS(string jsPath, System.Web.UI.Page p)
{
    string src = p.ResolveUrl(jsPath);
    return string.Concat(@"<script type=""text/javascript"" src=""", src, @"""></script>", "\r\n");
}
#endregion
}
}
例子
比如我们要获取一个网页的源代码
HttpHelper http = new HttpHelper();
HttpItem item = new HttpItem()
{
    URL = "http://www.sufeinet.com", // URL (required)
};
HttpResult result = http.GetHtml(item);
// Compress the fetched HTML with ZipHtml.
string html = HTMLHelper.ZipHtml(result.Html);
其他的方法调用和这个都一样,大家只要传不同的参数就是了。
OK,就到这里,有问题请跟帖吧。
|
|