300字范文,内容丰富有趣,生活中的好帮手!
300字范文 > htmlparser 获取html 根据htmlparser写的一个提取页面纯文本的C#程序

htmlparser 获取html 根据htmlparser写的一个提取页面纯文本的C#程序

时间:2022-05-29 02:34:21

相关推荐

htmlparser 获取html 根据htmlparser写的一个提取页面纯文本的C#程序

一个用 C# 编写的网页内容提取程序,在 Visual Studio 下调试完全通过,且无乱码现象。

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Diagnostics;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Windows.Forms;
using Winista.Text.HtmlParser;
using Winista.Text.HtmlParser.Filters;
using Winista.Text.HtmlParser.Http;
using Winista.Text.HtmlParser.Lex;
using Winista.Text.HtmlParser.Nodes;
using Winista.Text.HtmlParser.Tags;
using Winista.Text.HtmlParser.Util;
using Winista.Text.HtmlParser.Visitors;

namespace testhtml

{

/// <summary>
/// Sample form that downloads a web page and displays its plain text.
/// The URL is read from <c>textBox1</c>; the extracted text is written to <c>textBox2</c>.
/// </summary>
public partial class Form1 : Form
{
    /// <summary>Text returned by <see cref="getData"/> when no page content could be obtained.</summary>
    private const string FetchFailedMessage = "获取页面失败!";

    /// <summary>Number of characters, starting at "charset", that are searched for an encoding name.</summary>
    private const int CharsetProbeLength = 15;

    // The patterns are built once instead of on every call.
    // [\s\S] is used so that a block spanning several lines is matched as a whole.
    private static readonly Regex ScriptBlockPattern =
        new Regex(@"<script\b[^>]*>[\s\S]*?</script[^>]*>", RegexOptions.IgnoreCase);

    private static readonly Regex StyleBlockPattern =
        new Regex(@"<style\b[^>]*>[\s\S]*?</style[^>]*>", RegexOptions.IgnoreCase);

    private static readonly Regex WhitespaceRunPattern = new Regex(@"\s+");

    // Text shown by the most recent button1 click.
    string htmlText = "";

    /// <summary>Creates the form and its designer-generated controls.</summary>
    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Downloads the page named in <c>textBox1</c>, removes script/style blocks and
    /// markup, collapses whitespace, and shows the result in <c>textBox2</c>.
    /// </summary>
    private void button1_Click(object sender, EventArgs e)
    {
        string page = getData(textBox1.Text);

        if (page == FetchFailedMessage)
        {
            // Show the failure text as-is. It is not HTML, and toText keeps only
            // what it finds under a BODY tag, so the pipeline would likely drop it.
            htmlText = page;
            textBox2.Text = page;
            return;
        }

        string plainText = toText(delJsStyle(page));
        htmlText = delspace(plainText);
        textBox2.Text = htmlText;
    }

    /// <summary>
    /// Downloads <paramref name="WebUrl"/> as text. The page is first fetched with the
    /// system default encoding; if it declares a charset that maps to a different
    /// encoding, it is fetched again with that encoding.
    /// </summary>
    /// <param name="WebUrl">Address of the page to download.</param>
    /// <returns>
    /// The page text, or <c>"获取页面失败!"</c> when nothing could be downloaded.
    /// Download errors are reported in a message box rather than thrown.
    /// </returns>
    private string getData(string WebUrl)
    {
        // A local is used on purpose: writing to the htmlText field here let a failed
        // download fall back to the text left over from the previous click.
        string pageText = "";

        try
        {
            using (WebClient client = new WebClient())
            {
                client.Encoding = Encoding.Default;
                pageText = client.DownloadString(WebUrl);

                Encoding declared = encodingFromCharset(pageText);
                if (declared != null && declared.CodePage != client.Encoding.CodePage)
                {
                    // If this second request fails, the first download is kept.
                    client.Encoding = declared;
                    pageText = client.DownloadString(WebUrl);
                }
            }
        }
        catch (Exception ex)
        {
            MessageBox.Show(ex.Message + "ee");
        }

        if (pageText.Trim() == "")
        {
            pageText = FetchFailedMessage;
        }

        return pageText;
    }

    /// <summary>
    /// Chooses an encoding from the first "charset" declaration found in
    /// <paramref name="html"/>: gb2312 when the declaration mentions gbk or gb2312
    /// (any letter case), UTF-8 otherwise.
    /// </summary>
    /// <param name="html">Page text to inspect.</param>
    /// <returns>The chosen encoding, or <c>null</c> when the text contains no "charset".</returns>
    private static Encoding encodingFromCharset(string html)
    {
        int start = html.IndexOf("charset", StringComparison.OrdinalIgnoreCase);
        if (start < 0)
        {
            return null;
        }

        // Clamp the window so a declaration close to the end of the page cannot
        // make Substring run past the end of the string.
        int length = Math.Min(CharsetProbeLength, html.Length - start);
        string declaration = html.Substring(start, length);

        bool isGb = declaration.IndexOf("gbk", StringComparison.OrdinalIgnoreCase) >= 0
                 || declaration.IndexOf("gb2312", StringComparison.OrdinalIgnoreCase) >= 0;

        return isGb ? Encoding.GetEncoding("gb2312") : Encoding.UTF8;
    }

    /// <summary>
    /// Parses <paramref name="str"/> as HTML, selects the nodes matching the BODY tag
    /// filter, and returns the text collected from them by a text-extracting visitor.
    /// </summary>
    /// <param name="str">HTML source to parse.</param>
    /// <returns>The text extracted from the matched nodes.</returns>
    private string toText(string str)
    {
        Parser parser = new Parser(new Lexer(str));
        NodeFilter bodyFilter = new TagNameFilter("BODY");
        NodeList bodyNodes = parser.Parse(bodyFilter);

        TextExtractingVisitor visitor = new TextExtractingVisitor();
        bodyNodes.VisitAllNodesWith(visitor);

        return visitor.ExtractedText.ToString();
    }

    /// <summary>Clears the URL box when the form loads.</summary>
    private void Form1_Load_1(object sender, EventArgs e)
    {
        textBox1.Text = "";
    }

    /// <summary>Click handler for button2; intentionally does nothing.</summary>
    private void button2_Click(object sender, EventArgs e)
    {
    }

    /// <summary>
    /// Removes every <c>&lt;script&gt;…&lt;/script&gt;</c> and
    /// <c>&lt;style&gt;…&lt;/style&gt;</c> block, including its content, from
    /// <paramref name="str"/>. Tag names are matched case-insensitively.
    /// </summary>
    /// <param name="str">HTML source; <c>null</c> is treated as empty.</param>
    /// <returns>The input without script and style blocks.</returns>
    public static string delJsStyle(string str)
    {
        if (string.IsNullOrEmpty(str))
        {
            return string.Empty;
        }

        string withoutScripts = ScriptBlockPattern.Replace(str, "");
        return StyleBlockPattern.Replace(withoutScripts, "");
    }

    /// <summary>
    /// Replaces every run of whitespace (spaces, tabs, line breaks) in
    /// <paramref name="str"/> with a single space.
    /// </summary>
    /// <param name="str">Text to normalise; <c>null</c> is treated as empty.</param>
    /// <returns>The text with whitespace runs collapsed.</returns>
    public static string delspace(string str)
    {
        if (string.IsNullOrEmpty(str))
        {
            return string.Empty;
        }

        return WhitespaceRunPattern.Replace(str, " ");
    }
}

}

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。