php結合curl實現多線程抓取
1
|
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
|
<?php
/* Concurrent fetching with the curl_multi API */

/**
 * Fetch several URLs in parallel.
 *
 * Each URL gets its own easy handle; all handles are attached to one multi
 * handle and driven together until no transfer is active any more.
 *
 * @param array $array   URLs to fetch; the keys are reused as keys of the result arrays
 * @param int   $timeout per-transfer timeout in seconds (passed to CURLOPT_TIMEOUT)
 * @return array array(
 *                   'diff_time' => elapsed wall-clock seconds (float),
 *                   'return'    => response bodies as returned by curl_multi_getcontent(),
 *                   'header'    => curl_getinfo() results,
 *               ) -- 'return' and 'header' are keyed like $array
 */
function Curl_http($array, $timeout)
{
    // Initialise every collection so an empty URL list still yields a well-formed result.
    $res = array();
    $header = array();
    $conn = array();

    $mh = curl_multi_init();
    $startime = getmicrotime();

    foreach ($array as $k => $url) {
        // Surrounding whitespace is not part of a URL; strip it before handing it to curl.
        $conn[$k] = curl_init(trim($url));
        curl_setopt($conn[$k], CURLOPT_TIMEOUT, (int) $timeout); // transfer timeout
        curl_setopt($conn[$k], CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 5.01; Windows NT 5.0)');
        curl_setopt($conn[$k], CURLOPT_MAXREDIRS, 7);            // maximum number of redirects to follow
        curl_setopt($conn[$k], CURLOPT_HEADER, 0);               // leave response headers out of the body
        curl_setopt($conn[$k], CURLOPT_FOLLOWLOCATION, 1);       // follow 3xx redirects
        curl_setopt($conn[$k], CURLOPT_RETURNTRANSFER, 1);       // keep the body in memory instead of printing it
        curl_multi_add_handle($mh, $conn[$k]);
    }

    // Start the transfers.
    do {
        $mrc = curl_multi_exec($mh, $active);
    } while ($mrc == CURLM_CALL_MULTI_PERFORM);

    // Drive the transfers until none is active or the multi handle reports an error.
    while ($active && $mrc == CURLM_OK) {
        // curl_multi_select() returns -1 when it could not wait on the handles.
        // Back off briefly and still call curl_multi_exec(), otherwise $active
        // never changes and the loop spins forever at full CPU.
        if (curl_multi_select($mh) == -1) {
            usleep(100);
        }
        do {
            $mrc = curl_multi_exec($mh, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);
    }

    foreach ($array as $k => $url) {
        $res[$k] = curl_multi_getcontent($conn[$k]); // response body
        $header[$k] = curl_getinfo($conn[$k]);       // transfer information
        // Detach from the multi handle first, then release the easy handle.
        curl_multi_remove_handle($mh, $conn[$k]);
        curl_close($conn[$k]);
    }
    curl_multi_close($mh);

    $endtime = getmicrotime();
    $diff_time = $endtime - $startime;

    return array(
        'diff_time' => $diff_time,
        'return'    => $res,
        'header'    => $header
    );
}

/**
 * Current Unix timestamp with microsecond precision.
 *
 * @return float seconds since the Unix epoch
 */
function getmicrotime()
{
    return microtime(true);
}

// Try it out: fetch three sites concurrently.
$array = array(
    " http://www.weibo.com/ ",
    " http://www.renren.com/ ",
    " http://www.qq.com/ "
);
$data = Curl_http($array, '10'); // run the requests
var_dump($data);                 // dump the result

// If the POST data is larger than 1024 bytes, curl does not send the POST request right away.
// Send an empty Expect header with the request:
//   curl_setopt($ch, CURLOPT_HTTPHEADER, array("Expect:"));
?>
我們再來看幾個例子
(1)下面這段代碼實現抓取多個URL,然後將抓取到的頁面代碼寫入指定的文件
1
|
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
// Fetch several pages concurrently and append everything that is received to one file.

// Pages to fetch. The literals carry surrounding spaces, so each one is trimmed before use.
$urls = array(
    ' http://www.jfrwli.cn/ ',
    ' http://www.google.com/ ',
    ' http://www.example.com/ '
);
$save_to = '/test.txt'; // file that receives the fetched page source

$st = fopen($save_to, "a");
if ($st === false) {
    // Without a valid stream CURLOPT_FILE cannot be set; stop instead of continuing with a bad handle.
    throw new RuntimeException('Unable to open ' . $save_to . ' for appending');
}

$mh = curl_multi_init();
$conn = array();

// Set up one easy handle per URL.
foreach ($urls as $i => $url) {
    $conn[$i] = curl_init(trim($url));
    curl_setopt($conn[$i], CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)");
    curl_setopt($conn[$i], CURLOPT_HEADER, 0);
    curl_setopt($conn[$i], CURLOPT_CONNECTTIMEOUT, 60);
    // Every handle writes into the same stream.
    // NOTE(review): data from different transfers may end up interleaved in the file -- confirm this is acceptable.
    curl_setopt($conn[$i], CURLOPT_FILE, $st);
    curl_multi_add_handle($mh, $conn[$i]);
}

// Run the transfers. Waiting in curl_multi_select() between curl_multi_exec()
// calls keeps the loop from spinning at full CPU while nothing is happening.
do {
    $status = curl_multi_exec($mh, $active);
    if ($active && curl_multi_select($mh) == -1) {
        usleep(100); // select could not wait; back off briefly before retrying
    }
} while ($active && ($status == CURLM_OK || $status == CURLM_CALL_MULTI_PERFORM));

// Clean up.
foreach ($conn as $handle) {
    curl_multi_remove_handle($mh, $handle);
    curl_close($handle);
}
curl_multi_close($mh);
fclose($st);
(2)下面這段代碼和上面的意思差不多,只不過這里是先將獲得的代碼放入變量,然後再將獲取到的內容寫入指定的文件
1
|
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
// Fetch several pages concurrently, keep each body in memory, then append the bodies to one file.

// Pages to fetch. The literals carry surrounding spaces, so each one is trimmed before use.
$urls = array(
    ' http://www.jfrwli.cn/ ',
    ' http://www.google.com/ ',
    ' http://www.example.com/ '
);
$save_to = '/test.txt'; // file that receives the fetched page source

$st = fopen($save_to, "a");
if ($st === false) {
    // Nothing can be written without a valid stream; stop before starting any transfer.
    throw new RuntimeException('Unable to open ' . $save_to . ' for appending');
}

$mh = curl_multi_init();
$conn = array();

foreach ($urls as $i => $url) {
    $conn[$i] = curl_init(trim($url));
    curl_setopt($conn[$i], CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)");
    curl_setopt($conn[$i], CURLOPT_HEADER, 0);
    curl_setopt($conn[$i], CURLOPT_CONNECTTIMEOUT, 60);
    curl_setopt($conn[$i], CURLOPT_RETURNTRANSFER, true); // return the body as a string instead of printing it
    curl_multi_add_handle($mh, $conn[$i]);
}

// Run the transfers. Waiting in curl_multi_select() between curl_multi_exec()
// calls keeps the loop from spinning at full CPU while nothing is happening.
do {
    $status = curl_multi_exec($mh, $active);
    if ($active && curl_multi_select($mh) == -1) {
        usleep(100); // select could not wait; back off briefly before retrying
    }
} while ($active && ($status == CURLM_OK || $status == CURLM_CALL_MULTI_PERFORM));

// Write the bodies out in the order of $urls, one complete body after another.
foreach ($conn as $handle) {
    $data = curl_multi_getcontent($handle); // fetched page source
    fwrite($st, (string) $data);            // cast guards against a non-string result
}

// Clean up.
foreach ($conn as $handle) {
    curl_multi_remove_handle($mh, $handle);
    curl_close($handle);
}
curl_multi_close($mh);
fclose($st);
(3)下面這段代碼實現的是利用 PHP 的 Curl Functions 實現并發多線程下載文件
1
|
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
|
// Download several files concurrently, each into its own file below $save_to.

// Files to download. The literals carry surrounding spaces, so each one is trimmed before use.
$urls = array(
    ' http://www.jfrwli.cn/5w.zip ',
    ' http://www.jfrwli.cn/5w.zip ',
    ' http://www.jfrwli.cn/5w.zip '
);
$save_to = './home/'; // target directory (with trailing slash)

$mh = curl_multi_init();
$conn = array(); // easy handles, keyed like $urls; only for URLs that are actually downloaded
$fp = array();   // output streams, same keys as $conn

foreach ($urls as $i => $url) {
    $url = trim($url);
    $g = $save_to . basename($url); // local file name = last path segment of the URL

    // Skip files that already exist; this also skips repeated URLs,
    // because fopen() below creates the file immediately.
    if (is_file($g)) {
        continue;
    }

    // Open the target before creating the handle so a failure leaves nothing to clean up.
    $stream = fopen($g, "w");
    if ($stream === false) {
        continue;
    }

    $fp[$i] = $stream;
    $conn[$i] = curl_init($url);
    curl_setopt($conn[$i], CURLOPT_USERAGENT, "Mozilla/4.0(compatible; MSIE 7.0; Windows NT 6.0)");
    curl_setopt($conn[$i], CURLOPT_FILE, $fp[$i]); // stream the body straight into the file
    curl_setopt($conn[$i], CURLOPT_HEADER, 0);
    curl_setopt($conn[$i], CURLOPT_CONNECTTIMEOUT, 60);
    curl_multi_add_handle($mh, $conn[$i]);
}

// Run the transfers. Waiting in curl_multi_select() between curl_multi_exec()
// calls keeps the loop from spinning at full CPU while nothing is happening.
do {
    $n = curl_multi_exec($mh, $active);
    if ($active && curl_multi_select($mh) == -1) {
        usleep(100); // select could not wait; back off briefly before retrying
    }
} while ($active && ($n == CURLM_OK || $n == CURLM_CALL_MULTI_PERFORM));

// Clean up only what was created: iterate the handles, not the URL list,
// because skipped URLs have neither a handle nor a stream.
foreach ($conn as $i => $handle) {
    curl_multi_remove_handle($mh, $handle);
    curl_close($handle);
    fclose($fp[$i]);
}
curl_multi_close($mh);
以上所述就是本文的全部內容了,希望大家能夠喜歡。