Generally speaking, there are two ways to use threads. One is to write the function the thread should execute and pass it into a Thread object, which then runs it. The other is to subclass Thread directly, creating a new class and putting the thread's code inside that class.
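For example, the two modes look roughly like this (a minimal sketch; the worker function and MyThread class are made-up names for illustration):

import threading

# Mode 1: pass the function to run into a Thread object.
def worker(label):
    print("worker %s running" % label)

t1 = threading.Thread(target=worker, args=("A",))

# Mode 2: subclass Thread and put the thread's code in run().
class MyThread(threading.Thread):
    def __init__(self, label):
        threading.Thread.__init__(self)
        self.label = label

    def run(self):
        print("worker %s running" % self.label)

t2 = MyThread("B")

t1.start()
t2.start()
t1.join()
t2.join()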
This article implements a multithreaded web crawler: it uses multiple threads plus a lock and crawls pages in breadth-first order. First, a brief outline of the implementation idea:
For a web crawler that downloads pages breadth-first, the procedure works like this (a minimal single-threaded sketch of the loop follows the list):
1. Download the first page from the given entry URL.
2. Extract all new page addresses from that first page and put them into the download list.
3. Download every new page listed in the download list.
4. From all the newly downloaded pages, find the addresses that have not been downloaded yet and update the download list.
5. Repeat steps 3 and 4 until the updated download list is empty.
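Here is that loop as a minimal single-threaded sketch (the href regex and the missing error handling are simplifications for illustration; the real multithreaded code comes next):

import re
import urllib

def bfs_crawl(entry_url):
    queue = [entry_url]                  # download list, seeded with the entry URL (step 1)
    downloaded = set()                   # addresses already downloaded
    while queue:                         # stop when the download list is empty (step 5)
        new_links = []
        for url in queue:                # download everything in the list (step 3)
            try:
                html = urllib.urlopen(url).read()
            except Exception:
                continue
            downloaded.add(url)
            # extract new addresses from the page (step 2)
            new_links += re.findall(r'href="(http://[^"]+)"', html)
        # keep only addresses not downloaded yet (step 4)
        queue = list(set(new_links) - downloaded)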
The full multithreaded Python code is as follows:
#!/usr/bin/env python
#coding=utf-8
import threading
import urllib
import re
import time

g_mutex = threading.Condition()   # lock protecting the shared lists below
g_pages = []        # downloaded page contents, later parsed for new URLs
g_queueURL = []     # URLs waiting to be crawled
g_existURL = []     # URLs already crawled
g_failedURL = []    # URLs that failed to download
g_totalcount = 0    # number of pages downloaded so far

class Crawler:
    def __init__(self, crawlername, url, threadnum):
        self.crawlername = crawlername
        self.url = url
        self.threadnum = threadnum
        self.threadpool = []
        self.logfile = open("log.txt", 'w')

    def craw(self):
        global g_queueURL
        g_queueURL.append(self.url)
        depth = 0
        print self.crawlername + " started..."
        while len(g_queueURL) != 0:
            depth += 1
            print 'Searching depth', depth, '...\n\n'
            self.logfile.write("URL:" + g_queueURL[0] + "........")
            self.downloadAll()
            self.updateQueueURL()
            content = '\n>>>Depth ' + str(depth) + ':\n'
            self.logfile.write(content)
            i = 0
            while i < len(g_queueURL):
                content = str(g_totalcount + i) + '->' + g_queueURL[i] + '\n'
                self.logfile.write(content)
                i += 1

    def downloadAll(self):
        global g_queueURL
        global g_totalcount
        i = 0
        while i < len(g_queueURL):
            j = 0
            # start at most self.threadnum download threads per batch
            while j < self.threadnum and i + j < len(g_queueURL):
                g_totalcount += 1
                threadresult = self.download(g_queueURL[i + j], str(g_totalcount) + '.html', j)
                if threadresult != None:
                    print 'Thread started:', i + j, '--File number =', g_totalcount
                j += 1
            i += j
            # wait for the current batch of threads to finish
            for thread in self.threadpool:
                thread.join(30)
            self.threadpool = []
        g_queueURL = []

    def download(self, url, filename, tid):
        crawthread = CrawlerThread(url, filename, tid)
        self.threadpool.append(crawthread)
        crawthread.start()
        return crawthread

    def updateQueueURL(self):
        global g_queueURL
        global g_existURL
        newUrlList = []
        for content in g_pages:
            newUrlList += self.getUrl(content)
        # keep only the addresses that have not been crawled yet
        g_queueURL = list(set(newUrlList) - set(g_existURL))

    def getUrl(self, content):
        # simple pattern for absolute links in href attributes; adjust as needed
        reg = r'href="(http://[^"]+)"'
        regob = re.compile(reg, re.DOTALL)
        urllist = regob.findall(content)
        return urllist

class CrawlerThread(threading.Thread):
    def __init__(self, url, filename, tid):
        threading.Thread.__init__(self)
        self.url = url
        self.filename = filename
        self.tid = tid

    def run(self):
        global g_mutex
        global g_failedURL
        global g_queueURL
        try:
            page = urllib.urlopen(self.url)
            html = page.read()
            fout = open(self.filename, 'w')
            fout.write(html)
            fout.close()
        except Exception, e:
            # record the failure under the lock so other threads see a consistent state
            g_mutex.acquire()
            g_existURL.append(self.url)
            g_failedURL.append(self.url)
            g_mutex.release()
            print 'Failed downloading and saving', self.url
            print e
            return None
        # store the page and mark the URL as crawled under the lock
        g_mutex.acquire()
        g_pages.append(html)
        g_existURL.append(self.url)
        g_mutex.release()

if __name__ == "__main__":
    url = raw_input("Enter the entry URL:\n")
    threadnum = int(raw_input("Set the number of threads: "))
    crawlername = "little crawler"
    crawler = Crawler(crawlername, url, threadnum)
    crawler.craw()
That is the Python-based multithreaded web crawler shared above; I hope you find it useful.