当前位置:编程学习 > VB >>

利用VB6.0根据特征码抓取网站信息

文/图 红色银狐
好久没有写文章了,我的朋友突然有一天想起了一个问题并对我说,红色银狐你看看能不能把网页中我想要的标题和文章内容抓取出来呢?带着好奇的心理,我试着和朋友沟通了一番,结果是肯定的,用了三天时间就大体完成了朋友想要的功能,如果再把细节做好点,我想还是满不错的。
下面我们进入正题。首先做的不是编写代码,而是做分析,画出流程图。这里简单介绍一下。
第一:输入关键字,利用百度搜索,或者GOOGLE搜索进行搜索;
第二:分析搜索的超链接,并分析有效链接和无效链接;
第三:利用Internet Transfer控件抓取网页内容;
第四:根据设置特征码抓取你想要的信息;
第五:将获取的信息进行处理,并写入数据库。
有了上面的流程,我们很快就可以设计出界面来,不过我的界面设置的很难看,也就是能用而已,希望读者看后自己再美化一下,如图1所示。

图1
 
接下来我们要做的就是根据关键字的设置,搜索所有可能的超链接,实现代码如下。
 
获取网页超连接的过程
Private Sub GetLink()
Dim AllItem As Object
Dim TagName As String
Dim Counts, i As Integer
Set AllItem = WebBrowser1.document.All
Counts = AllItem.length
ProgressBar1.Max = Counts
For i = 0 To Counts - 1 Step 1
DoEvents
DoEvents
DoEvents
If BooleStop = True Then Exit For
TagName = LCase(AllItem.Item(i).TagName)
If TagName = "img" Or TagName = "a" Then
过滤带图片和其他格式的地址
If Left(LCase(AllItem.Item(i).href), 7) = "mailto:" Then NValidCounts = NValidCounts + 1: GoTo Next_Loop
If Len(LCase(AllItem.Item(i).href)) = 0 Then NValidCounts = NValidCounts + 1: GoTo Next_Loop
If Left(LCase(AllItem.Item(i).href), 11) = "javascript:" Then NValidCounts = NValidCounts + 1: GoTo Next_Loop
……省略,随文详见源代码……
If Right(LCase(AllItem.Item(i).href), 4) = ".mp3" Then NValidCounts = NValidCounts + 1: GoTo Next_Loop
ltUrl.AddItem LCase(AllItem.Item(i).href)
ValidCounts = ValidCounts + 1
End If
lbNValidUrl = "无效链接个数" & NValidCounts & "个"
lbValidUrl = "有效链接个数" & ValidCounts & "个"
ProgressBar1.Value = i + 1
Next_Loop:
Next
ProgressBar1.Value = 0
Set AllItem = Nothing
End Sub

这里要注意,这段代码不是最好和最优化的,不过大家可以把无效的超链接扩展名放到一个数组中,在判断的时候不用If和Else、End If,而用Select Case、End Select,这样即可提高效率和速度,因为朋友急用,所以我只是考虑了能否做出,没有考虑软件优化。在得到超链接后,我们还必须除去那些无效链接,否则垃圾东西会太多。里面的GOTO语句是执行跳转的,它是区分有效和无效链接之用的,如果无效就跳转并且无效自动加一。获取超链接要用到Webbrowse控件。
   接下来便是抓取每个网页的内容了,那么Internet Transfer控件的OpenURL便是它的英雄用武之处了,它可以将获取的网页源码放到临时变量中。如图2所示,是我抓取的所有有效的超链接,这里是用百度知道抓取的,所以还有下一页和上一页,如果有的话就尽可能地翻到尾页,那样所有数据一个都不会漏掉。程序实现的关键是如何根据特征码抓取想要的标题和文章。实现代码如下。

图2
 
数据小偷开始
Private Sub ThiefData()
Dim i As Long, ErrNumber As Long, k As Long
……省略,随文详见源代码……
Dim Stm As TextStream
Url列表为空退出分析
UrlCount = ltUrl.ListCount
If UrlCount = 0 Then
Exit Sub
End If
On Error Resume Next
sTzmStart = Replace(Trim(tzmStart.Text), vbCrLf, "", 1, -1)
sTzmEnd = Replace(Trim(tzmEnd.Text), vbCrLf, "", 1, -1)
sRepStr = Replace(Trim(RepStr.Text), vbCrLf, "", 1, -1, vbTextCompare)
sRepStrA = Replace(Trim(txtReplace.Text), vbCrLf, "", 1, -1)
strRepStr = Split(sRepStr, "|", -1, vbTextCompare)
strRepStrA = Split(sRepStrA, "|", -1, vbTextCompare)
scmbType = cmbType.Text
如果是去除开始特征码
If RevStar.Value = 1 Then
sTempStar = sTzmStart
Else
sTempStar = "RedSilverFox"
End If
If RevEnd.Value = 1 Then
sTempEnd = ""
Else
sTempEnd = sTzmEnd
End If
ii = 1
ErrNumber = 0
strPath = App.Path & "Temp"
lbHtmlCode.Caption = "开始获取网页源码,请等待..."
ProgressBar1.Max = UrlCount
Inet1.RequestTimeout = 120
Set Fso = CreateObject("Scripting.FileSystemObject")
For i = 0 To UrlCount - 1
sUrl = ltUrl.List(i)
DoEvents
strHtml = Inet1.OpenURL(sUrl, 0)
strHtml = Replace(strHtml, Chr(9), "", 1, -1, vbTextCompare)
Inet1.Cancel
ProgressBar1.Value = i + 1
lbHtmlCode.Caption = "将获取网页源码生成静态页面,请等待..."
If Len(strHtml) = 0 Then
strHtml = "空值"
End If
If Fso.FolderExists(strPath) = False Then
Fso.CreateFolder (strPath)
End If
Set Stm = Fso.CreateTextFile(strPath & "" & i & ".htm", True, False)
Stm.WriteLine (strHtml)
Stm.Close
Set Stm = Nothing
Next
ProgressBar1.Value = 0
lbHtmlCode.Caption = "静态页面生成完毕。根据特征码开始分析数据..."
Set Conn = CreateObject("ADODB.Connection")
Conn.open "Provider=Microsoft.Jet.OLEDB.4.0;Data Source=" & App.Path & "ThiefData.mdb" & ";Persist Security Info=False"
Conn.open "Provider=SQLOLEDB.1;User Id=" & UserName & ";Password=" & UserPass & ";Data Source=" & ServerNm & ";Initial Catalog=" & DataName & ";Persist Security Info=False"
Set Rs = CreateObject("ADODB.Recordset")
For i = 0 To UrlCount - 1
DoEvents
Set Stm = Fso.OpenTextFile(strPath & "" & i & ".htm", ForReading, False)
strHtml = Stm.ReadAll
sTitle = HtmlTitle(strHtml, "<title>", "</title>")
sContent = HtmlBodyA(strHtml, sTzmStart, sTzmEnd)
sContent = Replace(sContent, sTempStar, "", 1, -1, vbTextCompare)
sContent = sContent
If Len(sTitle) > 0 And Len(sContent) > 0 Then
lbHtmlCode.Caption = "找到" & ii & "个网页标题和找到" & ii & "个网页内容!"
sTitle = Replace(LCase(sTitle), "<title>", "", 1, -1, vbTextCompare)
sTitle = Replace(sTitle, "</title>", "", 1, -1, vbTextCompare)
sTitle = Trim(sTitle)
去掉标题和内容中相关部分
For k = LBound(strRepStr) To UBound(strRepStr)
sTitle = Replace(sTitle, strRepStr(k), strRepStrA(k), 1, -1, vbTextCompare)
sContent = Replace(sContent, strRepStr(k), strRepStrA(k), 1, -1, vbTextCompare)
Next
txtTitle.Text = sTitle
txtContent.Text = sContent & sTempEnd
On Error Resume Next
Rs.open "SELECT NewsTitle,NewsContent,NewsType FROM NewsArticle WHERE NewsID IS NULL", Conn, 1, 2
Rs.AddNew
Rs("NewsTitle") = sTitle
Rs("NewsContent") = sContent & sTempEnd
Rs("NewsType") = scmbType
Rs.Update
If Err.Number <> 0 Then
ErrNumber = ErrNumber + 1
Call MsgBox(Err.Description, vbCritical, TitleInfo)
End If
ii = ii + 1
Rs.Close
End If
Stm.Close
Set Stm = Nothing
ProgressBar1.Value = i + 1
DoEvents
Next
On Error GoTo 0
lbHtmlCode.Caption = (ii - ErrNumber - 1) & "条数据成功添加到数据库!" & ErrNumber & "条数据添加失败!"
Set Rs = Nothing
Conn.Close
Set Conn = Nothing
ProgressBar1.Value = 0
BooleThief = False
cmdSearch.Enabled = True
FramSet.Enabled = True
FramResult.Enabled = True
End Sub

简单举个例子,比如你想要网页中的“<table>……</table>”之间的内容,就要保证这个标记是独一无二的,否则它会抓错数据,得到不想要的数据,这里一共有两种方法来抓取你指定的数据,实现代码如下。

启用正则表达式来获取TITLE数据或指定特征码之间的数据
Private Function HtmlTitle(ByVal strCode As String, ByVal startStr As String, ByVal overStr As String)
Dim Reg As RegExp
Dim Matches As Match
Dim colMatches As MatchCollection
设置配置对象
Set Reg = New RegExp
忽略大小写
Reg.IgnoreCase = True
设置为全文搜索
Reg.Global = True
正则表达式
Reg.Pattern = startStr & ".+?" & overStr
开始执行配置
Set colMatches = Reg.Execute(strCode)
HtmlTitle = ""
For Each Matches In colMatches
循环匹配
HtmlTitle = HtmlTitle & Matches.Value
Next
Set colMatches = Nothing
Set

补充:软件开发 , Vb ,
CopyRight © 2012 站长网 编程知识问答 www.zzzyk.com All Rights Reserved
部份技术文章来自网络,