当前位置:编程学习 > C#/ASP.NET >>

求大家讲解一下c#网页采集器的代码

作业要求:利用C#实现一个网络采集器。功能可参照火车采集器,重点功能在于数据采集,包括网络地址的批量采集,页面内容过滤,采集结果保存等。
代码求大家讲解一下这些部分:button2_Click  button1_Click  void getinformation   refine  saveas  geturl  button2_Click  button5_Click
我会把剩余的分全追加了,求大家仔细讲解,我是新手
设计界面截图:
[img][/img]

主要代码:

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Net;
using System.IO;
using System.Diagnostics;
using System.Text.RegularExpressions;
using System.Collections;
namespace 网络文本挖掘器
{

    public partial class Form1 : Form
    {
        // URL the form currently works on: overwritten from textBox1 on every edit, used as the
        // download/navigation address by the button handlers and as the prefix built on by geturl.
        public static string s1="http://www.2345.com";
        // Base name (without the ".txt" suffix) of the file that saveas writes to.
        public static string filename;
        // Text extracted by getinformation and later handed to saveas by button5_Click.
        public static string context;
        // Output location prefix taken from textBox2; the empty string makes saveas use its built-in default.
        public static string filepath="";

        /// <summary>Creates the form and runs the designer-generated control setup.</summary>
        public Form1()
        {
            InitializeComponent();
        }
        private void textBox1_TextChanged(object sender, EventArgs e)
        {
            s1=textBox1.Text.ToString();
        }

        
        /// <summary>DocumentCompleted handler for webBrowser1; intentionally does nothing.</summary>
        private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            // No action is taken when a page finishes loading.
        }
        private void button4_Click(object sender, EventArgs e)
        {
            try
            {
                webBrowser1.Navigate(new Uri(s1));
            }
            catch
            {
                MessageBox.Show("无效的网址!!!你的智商令人捉鸡呀!!!");
                s1 = "";
            }
        }
        private void button1_Click(object sender, EventArgs e)
        {
          
            try
            {  WebClient myWebclient = new WebClient();
                byte[] mydatabuffer = myWebclient.DownloadData(s1);
                string ss= Encoding.UTF8.GetString(mydatabuffer);
                filename = "当前页面";
                saveas(filename, ss);
            }
            catch
            {
                MessageBox.Show("无效的网址!!!你的智商令人捉鸡呀!!!");
                s1 = "";
            }
        }



        private static void getinformation(string url)
        {
            try
            {
                int start1,start2,end1,end2,start3,temp;
                WebClient myWebclient = new WebClient();
                byte[] mydatabuffer = myWebclient.DownloadData(url);
                string ss = Encoding.UTF8.GetString(mydatabuffer);
                start1 = ss.IndexOf(@"CNNVD编号:</td>");
                end1 = ss.IndexOf(@"m", start1, ss.Length - start1);
                start2 = ss.IndexOf(@"cnnvd3_12_24.jpg");
                start3 = ss.IndexOf(@"m", start2, ss.Length - start2);
                temp = start3;
               do
                {
                    end2 = ss.IndexOf(@"<", temp, ss.Length - temp);
                    temp = end2+1;
                }
                while (ss.Substring(end2 + 1, 1) != @"/");
                context=refine(ss.Substring(start3+3,end2-start3-3));
                filename = refine(ss.Substring(end1+3, 16));
            }
            catch
            {
                MessageBox.Show("网址无效!!!智商啊!!!");
            }
            
        }
        private static string refine(string s)
        {
            string sy;
           sy=s.Replace("<br/>","");
           
           sy=sy.Replace(" ", "\n");

           return sy;
                
          
        }
        private static void saveas(string filename, string context)
        {
            if (filepath == "")
            {
                FileInfo file = new FileInfo(@"D:\\" + filename + ".txt");
                StreamWriter sw = file.AppendText();
                sw.Write(context);
                sw.Close();
                sw.Dispose();
            }
            else
            {
                try
                {
                    FileInfo file = new FileInfo(@filepath + filename + ".txt");
                    StreamWriter sw = file.AppendText();
                    sw.Write(context);
                    sw.Close();
                    sw.Dispose();
                }
                catch
                {
                    MessageBox.Show("文件路径输入错误,存入默认路径!自己复制粘贴去");
                }
            }
            

        }
        private static string geturl(string s)
        {
            int i = 0;
            int count = 2;
            string temp = s1;
            char[] c = s.ToCharArray();
            for (i = 0; i < s.Length; i++)
            {
                if (c[i] == '/')
                { count--; }
                 if(count==0)  
                     break;
            }
            for (int j = i; j < s.Length-1; j++)
            {
                temp += c[j];
            }

            return temp;
        }
        private void button2_Click(object sender, EventArgs e)
        {
            try
            {
                System.Net.WebClient client = new WebClient();
                byte[] page = client.DownloadData(s1);
                string content = System.Text.Encoding.UTF8.GetString(page);
                string regex = "href=[\\\"\\\'](http:\\/\\/|\\.\\/|\\/)?\\w+(\\.\\w+)*(\\/\\w+(\\.\\w+)?)*(\\/|\\?\\w*=\\w*(&\\w*=\\w*)*)?[\\\"\\\']";
                Regex re = new Regex(regex);
               
                MatchCollection matches = re.Matches(content);
                System.Collections.IEnumerator enu = matches.GetEnumerator();
                this.comboBox1.Items.Clear();
                this.comboBox2.Items.Clear();
                while (enu.MoveNext() && enu.Current != null)
                {
                    Match match = (Match)(enu.Current);
                    this.comboBox1.Items.Add(match.Value);
                    this.comboBox2.Items.Add(match.Value);
                 }
                MessageBox.Show("网址分析完毕,请通过下拉式菜单选取需要批量采集的范围");
            }
            catch
            {
                MessageBox.Show("无效的网址!!!你的智商令人捉鸡呀!!!");
            }
        }
        /// <summary>Closes this form and then asks the application to exit.</summary>
        private void button3_Click(object sender, EventArgs e)
        {
            this.Close();
            Application.Exit();
        }
        private void button5_Click(object sender, EventArgs e)
        {

            if (this.comboBox1.SelectedIndex >this.comboBox2.SelectedIndex||this.comboBox1.SelectedIndex==-1||this.comboBox2.SelectedIndex==-1)
            {
                MessageBox.Show("开头比结束位置还靠后,你是不是傻呀!");
            }
            else
            {
                for (int i = this.comboBox1.SelectedIndex; i <= this.comboBox2.SelectedIndex; i++)
                {
                    getinformation(geturl(this.comboBox1.Items[i].ToString()));
                    saveas(filename, context);
                }
                MessageBox.Show("采集完成,已存入制定文件夹,请查阅");
            }

        }

        private void textBox2_TextChanged(object sender, EventArgs e)
        {
            filepath = textBox2.Text.ToString();
        }

        /// <summary>SelectedIndexChanged handler for comboBox1; intentionally does nothing.</summary>
        private void comboBox1_SelectedIndexChanged(object sender, EventArgs e)
        {
            // The selection is read directly from the control in button5_Click.
        }

        /// <summary>SelectedIndexChanged handler for comboBox2; intentionally does nothing.</summary>
        private void comboBox2_SelectedIndexChanged(object sender, EventArgs e)
        {
            // The selection is read directly from the control in button5_Click.
        }   
    }
}
--------------------编程问答-------------------- --------------------编程问答-------------------- button4_Click 将浏览器转到输入的url
button1_Click 保存当前页面
geturl 提取url,其实就是去掉 http:// (汗) --------------------编程问答-------------------- button3_Click 关闭
button2_Click 提取链接,用了正则表达式
getinformation 从特定网页提取特定数据

总结:代码很烂很烂很烂;而且要么是一个人拼凑粘贴的,要不然是几个人写的,同样的事情,用了不止一种方法实现。写这个代码的人的智商堪比驴子。 --------------------编程问答-------------------- 怎么看都不像我写的,还好,还好 --------------------编程问答-------------------- button2_Click   //提取链接,用了正则表达式
button1_Click  //扒取页面
void getinformation   //应该是匹配页面内容
refine   //过滤特殊字符
saveas   //将扒取的页面保存为本地磁盘上的 txt 文件
geturl  //提取url
button5_Click  //没什么 基本验证 。 然后调用void getinformation


其实,你看messagebox.show()里的内容 也能分析个八九不离十了!
补充:.NET技术 ,  C#
CopyRight © 2012 站长网 编程知识问答 www.zzzyk.com All Rights Reserved
部份技术文章来自网络,