求大家讲解一下c#网页采集器的代码
作业要求:利用C#实现一个网络采集器。功能可参照火车采集器,重点功能在于数据采集,包括网络地址的批量采集,页面内容过滤,采集结果保存等。代码求大家讲解一下这些部分:button2_Click button1_Click void getinformation refine saveas geturl button2_Click button5_Click
我会把剩余的分全追加了,求大家仔细讲解,我是新手
设计界面截图:
[img](设计界面截图缺失)[/img]
主要代码:
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Net;
using System.IO;
using System.Diagnostics;
using System.Text.RegularExpressions;
using System.Collections;
namespace 网络文本挖掘器
{
public partial class Form1 : Form
{
// URL the tool currently works on; overwritten from textBox1 and reset to "" by some handlers after a failed request.
public static string s1="http://www.2345.com";
// Base name (without the ".txt" extension) of the file the next saveas call writes to.
public static string filename;
// Text that the batch-collect handler passes to saveas after getinformation has filled it in.
public static string context;
// Output location typed into textBox2; an empty string makes saveas use its built-in D: default.
public static string filepath="";
/// <summary>
/// Creates the form and calls InitializeComponent, which is defined in another part of this partial class.
/// </summary>
public Form1()
{
InitializeComponent();
}
/// <summary>
/// Mirrors whatever is typed into textBox1 into the shared <c>s1</c> URL field.
/// </summary>
private void textBox1_TextChanged(object sender, EventArgs e)
{
    string typedUrl = textBox1.Text;
    s1 = typedUrl;
}
/// <summary>
/// Handler with the DocumentCompleted signature for webBrowser1 (event wiring is not shown here).
/// Currently does nothing.
/// </summary>
private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
}
/// <summary>
/// Points the embedded browser at the URL currently stored in <c>s1</c>.
/// Any exception raised while building the Uri or starting navigation shows an
/// error dialog and then clears <c>s1</c>.
/// </summary>
private void button4_Click(object sender, EventArgs e)
{
    try
    {
        Uri destination = new Uri(s1);
        webBrowser1.Navigate(destination);
    }
    catch
    {
        MessageBox.Show("无效的网址!!!你的智商令人捉鸡呀!!!");
        s1 = "";
    }
}
/// <summary>
/// Downloads the page at <c>s1</c>, decodes the bytes as UTF-8 and appends the text
/// to the file named "当前页面" through <c>saveas</c>.
/// </summary>
/// <remarks>
/// Every exception raised inside the try block (download, decoding, or one that
/// escapes from <c>saveas</c>) is reported with the same dialog, after which
/// <c>s1</c> is cleared.
/// </remarks>
private void button1_Click(object sender, EventArgs e)
{
    try
    {
        string pageText;

        // WebClient is IDisposable: release it as soon as the download is done
        // instead of leaving it for the finalizer.
        using (WebClient client = new WebClient())
        {
            byte[] raw = client.DownloadData(s1);
            pageText = Encoding.UTF8.GetString(raw);
        }

        filename = "当前页面";
        saveas(filename, pageText);
    }
    catch
    {
        MessageBox.Show("无效的网址!!!你的智商令人捉鸡呀!!!");
        s1 = "";
    }
}
/// <summary>
/// Downloads <paramref name="url"/>, decodes it as UTF-8 and extracts two pieces of text
/// into the static fields <c>filename</c> and <c>context</c> (both passed through <c>refine</c>).
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <remarks>
/// Both fields are reset to null on entry and are only assigned together once the
/// extraction has fully succeeded, so a failed call can never leave the previous
/// page's data behind. Failures of any kind show an error dialog; no exception
/// escapes this method.
/// </remarks>
private static void getinformation(string url)
{
    // Drop the previous page's results up front so callers cannot re-save stale data.
    filename = null;
    context = null;

    try
    {
        string page;

        // WebClient is IDisposable: dispose it deterministically.
        using (WebClient client = new WebClient())
        {
            page = Encoding.UTF8.GetString(client.DownloadData(url));
        }

        string id;
        string body;
        if (!tryextract(page, out id, out body))
        {
            MessageBox.Show("网址无效!!!智商啊!!!");
            return;
        }

        context = body;
        filename = id;
    }
    catch (Exception)
    {
        MessageBox.Show("网址无效!!!智商啊!!!");
    }
}

// Locates the id and body text inside the downloaded page; returns false when any anchor is missing.
private static bool tryextract(string page, out string id, out string body)
{
    id = null;
    body = null;

    // Offsets carried over from the original code: 3 characters are skipped after each
    // 'm' anchor and the id is exactly 16 characters long.
    // NOTE(review): these presumably match the markup of one specific site - confirm.
    const int skipAfterAnchor = 3;
    const int idLength = 16;

    int idMarker = page.IndexOf(@"CNNVD编号:</td>", StringComparison.Ordinal);
    int bodyMarker = page.IndexOf(@"cnnvd3_12_24.jpg", StringComparison.Ordinal);
    if (idMarker < 0 || bodyMarker < 0)
    {
        return false;
    }

    // First 'm' at or after each marker.
    int idAnchor = page.IndexOf('m', idMarker);
    int bodyAnchor = page.IndexOf('m', bodyMarker);
    if (idAnchor < 0 || bodyAnchor < 0)
    {
        return false;
    }

    // The body ends at the first closing tag ("</") after its anchor. A direct search
    // replaces the hand-rolled loop, which could restart from index 0 and never terminate.
    int bodyStart = bodyAnchor + skipAfterAnchor;
    int bodyEnd = page.IndexOf("</", bodyAnchor, StringComparison.Ordinal);
    int idStart = idAnchor + skipAfterAnchor;
    if (bodyEnd < bodyStart || idStart + idLength > page.Length)
    {
        return false;
    }

    body = refine(page.Substring(bodyStart, bodyEnd - bodyStart));
    id = refine(page.Substring(idStart, idLength));
    return true;
}
/// <summary>
/// Cleans a scraped fragment: every "&lt;br/&gt;" is removed and every space
/// character is replaced by a line feed.
/// </summary>
/// <param name="s">Text to clean; must not be null.</param>
/// <returns>The cleaned text.</returns>
private static string refine(string s)
{
    return s.Replace("<br/>", "").Replace(" ", "\n");
}
/// <summary>
/// Appends <paramref name="context"/> to "<paramref name="filename"/>.txt".
/// The file goes into the directory held in the static <c>filepath</c> field, or into
/// D:\ when <c>filepath</c> is empty.
/// </summary>
/// <param name="filename">File name without extension.</param>
/// <param name="context">Text to append.</param>
/// <remarks>
/// When <c>filepath</c> is empty, I/O errors propagate to the caller (as before).
/// When writing to a user-supplied <c>filepath</c> fails, a dialog is shown and the text
/// is then written to the default directory, which is what that dialog announces.
/// If that fallback fails as well, a second dialog reports it; no exception escapes
/// from the user-supplied-path branch.
/// </remarks>
private static void saveas(string filename, string context)
{
    const string defaultDirectory = @"D:\";

    if (string.IsNullOrEmpty(filepath))
    {
        appendtext(defaultDirectory, filename, context);
        return;
    }

    try
    {
        appendtext(filepath, filename, context);
    }
    catch (Exception)
    {
        MessageBox.Show("文件路径输入错误,存入默认路径!自己复制粘贴去");

        // Actually perform the fallback the message promises.
        try
        {
            appendtext(defaultDirectory, filename, context);
        }
        catch (Exception)
        {
            MessageBox.Show("默认路径也无法写入,保存失败!");
        }
    }
}

// Appends text to "<name>.txt" inside directory; the writer is disposed even if the write throws.
private static void appendtext(string directory, string name, string text)
{
    // Path.Combine inserts the separator when the directory does not end with one.
    string path = Path.Combine(directory, name + ".txt");
    using (StreamWriter writer = File.AppendText(path))
    {
        writer.Write(text);
    }
}
/// <summary>
/// Turns one matched <c>href="..."</c> attribute into an absolute URL, resolving
/// relative links against the base address stored in <c>s1</c>.
/// </summary>
/// <param name="s">The attribute text, e.g. <c>href="/a/b"</c> or <c>href='http://host/x'</c>.</param>
/// <returns>
/// The resolved absolute URL. Returns <c>s1</c> unchanged when <paramref name="s"/>
/// is null/empty or carries no link text.
/// </returns>
/// <remarks>
/// Uses standard URI resolution instead of counting slashes by hand, so absolute
/// links and links that do not share the base's first path segment resolve correctly.
/// </remarks>
private static string geturl(string s)
{
    if (string.IsNullOrEmpty(s))
    {
        return s1;
    }

    // Keep only the attribute value: everything after '=' with the surrounding quotes removed.
    int equalsAt = s.IndexOf('=');
    string link = equalsAt >= 0 ? s.Substring(equalsAt + 1) : s;
    link = link.Trim().Trim('"', '\'');
    if (link.Length == 0)
    {
        return s1;
    }

    Uri baseUri;
    Uri resolved;
    if (Uri.TryCreate(s1, UriKind.Absolute, out baseUri)
        && Uri.TryCreate(baseUri, link, out resolved))
    {
        return resolved.AbsoluteUri;
    }

    // The base address is unusable: an absolute link can still stand on its own.
    if (Uri.TryCreate(link, UriKind.Absolute, out resolved))
    {
        return resolved.AbsoluteUri;
    }

    // Last resort: plain concatenation; the caller will report the bad address.
    return s1 + link;
}
/// <summary>
/// Downloads the page at <c>s1</c>, finds its <c>href</c> attributes with a regular
/// expression and lists every match in both comboBox1 and comboBox2, so the user
/// can pick the first and last link of the range to collect.
/// </summary>
/// <remarks>
/// Any exception shows the "invalid address" dialog. The combo boxes are only cleared
/// after the download has succeeded.
/// </remarks>
private void button2_Click(object sender, EventArgs e)
{
    try
    {
        string content;

        // WebClient is IDisposable: dispose it deterministically.
        using (WebClient client = new WebClient())
        {
            content = Encoding.UTF8.GetString(client.DownloadData(s1));
        }

        // Matches href='...' / href="..." whose value is an optional "http://", "./" or "/"
        // prefix, dot-separated word characters, optional /word path segments and an
        // optional trailing "/" or ?key=value(&key=value)* query.
        string regex = "href=[\\\"\\\'](http:\\/\\/|\\.\\/|\\/)?\\w+(\\.\\w+)*(\\/\\w+(\\.\\w+)?)*(\\/|\\?\\w*=\\w*(&\\w*=\\w*)*)?[\\\"\\\']";
        MatchCollection matches = new Regex(regex).Matches(content);

        this.comboBox1.Items.Clear();
        this.comboBox2.Items.Clear();
        foreach (Match match in matches)
        {
            this.comboBox1.Items.Add(match.Value);
            this.comboBox2.Items.Add(match.Value);
        }

        MessageBox.Show("网址分析完毕,请通过下拉式菜单选取需要批量采集的范围");
    }
    catch
    {
        MessageBox.Show("无效的网址!!!你的智商令人捉鸡呀!!!");
    }
}
/// <summary>
/// Closes this form and then asks the application to exit.
/// </summary>
private void button3_Click(object sender, EventArgs e)
{
    Close();
    Application.Exit();
}
/// <summary>
/// Batch collection: for every link between the item selected in comboBox1 and the
/// item selected in comboBox2 (inclusive), resolves the link with <c>geturl</c>,
/// extracts its data with <c>getinformation</c> and stores it with <c>saveas</c>.
/// </summary>
/// <remarks>
/// The range is rejected with a dialog when either combo box has no selection or
/// the start lies after the end. A link whose extraction fails is skipped instead
/// of saving the previous link's data a second time.
/// </remarks>
private void button5_Click(object sender, EventArgs e)
{
    int first = this.comboBox1.SelectedIndex;
    int last = this.comboBox2.SelectedIndex;

    if (first == -1 || last == -1 || first > last)
    {
        MessageBox.Show("开头比结束位置还靠后,你是不是傻呀!");
        return;
    }

    for (int i = first; i <= last; i++)
    {
        // Clear the shared result fields so a failed fetch is detectable below.
        filename = null;
        context = null;

        getinformation(geturl(this.comboBox1.Items[i].ToString()));

        // getinformation reports its own errors; nothing was extracted, so there is nothing to save.
        if (filename == null || context == null)
        {
            continue;
        }

        saveas(filename, context);
    }

    MessageBox.Show("采集完成,已存入制定文件夹,请查阅");
}
/// <summary>
/// Mirrors whatever is typed into textBox2 into the shared <c>filepath</c> field.
/// </summary>
private void textBox2_TextChanged(object sender, EventArgs e)
{
    string typedPath = textBox2.Text;
    filepath = typedPath;
}
/// <summary>
/// Empty handler: nothing happens here when the comboBox1 selection changes.
/// The selected index is read later by button5_Click.
/// </summary>
private void comboBox1_SelectedIndexChanged(object sender, EventArgs e)
{
}
/// <summary>
/// Empty handler: nothing happens here when the comboBox2 selection changes.
/// The selected index is read later by button5_Click.
/// </summary>
private void comboBox2_SelectedIndexChanged(object sender, EventArgs e)
{
}
}
}
--------------------编程问答-------------------- --------------------编程问答-------------------- button4_Click 将浏览器转到输入的url
button1_Click 保存当前页面
geturl 提取url,其实就是去掉 http:// (汗) --------------------编程问答-------------------- button3_Click 关闭
button2_Click 提取链接,用了正则表达式
getinformation 从特定网页提取特定数据
总结:代码很烂很烂很烂;而且要么是一个人拼凑粘贴的,要不然是几个人写的,同样的事情,用了不止一种方法实现。写这个代码的人的智商堪比驴子。 --------------------编程问答-------------------- 怎么看都不像我写的,还好,还好 --------------------编程问答-------------------- button2_Click //提取链接,用了正则表达式
button1_Click //扒取页面
void getinformation //应该是匹配页面内容
refine //过滤特殊字符
saveas //将扒取的页面以 txt 文件的形式存放到本地磁盘
geturl //提取url
button5_Click //没什么 基本验证 。 然后调用void getinformation
其实,你看messagebox.show()里的内容 也能分析个八九不离十了!
补充:.NET技术 , C#