
Python Multi-threaded and Async + Multi-process Crawler Implementations

2020-08-12 09:53 · Python教程網 · Python

This article presents implementation code for multi-threaded and async + multi-process crawlers in Python; readers who need it can use it as a reference.

Install Tornado
If you want to keep things simple you can use the grequests library directly; the code below uses Tornado's asynchronous HTTP client instead. The async part relies on Tornado: a simple asynchronous spider class adapted from the example in the official documentation. It is worth reading the latest docs as well.
pip install tornado
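For comparison, a minimal grequests sketch (my own illustration, not from the original article; the URLs are placeholders) that fetches a batch of pages concurrently might look like this:

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import grequests

urls = ['http://www.baidu.com?page=%s' % page for page in range(1, 10)]
# build the requests lazily, then send them all concurrently
reqs = (grequests.get(u) for u in urls)
for resp in grequests.map(reqs, size=10):  # size caps the concurrency
  if resp is not None and resp.status_code == 200:
    print(resp.url)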

Async Spider

#!/usr/bin/env python
# -*- coding:utf-8 -*-
 
import time
from datetime import timedelta
from tornado import httpclient, gen, ioloop, queues
import traceback
 
 
class AsySpider(object):
  """A simple class of asynchronous spider."""
  def __init__(self, urls, concurrency=10, **kwargs):
    urls.reverse()
    self.urls = urls
    self.concurrency = concurrency
    self._q = queues.Queue()
    self._fetching = set()
    self._fetched = set()
 
  def fetch(self, url, **kwargs):
    fetch = getattr(httpclient.AsyncHTTPClient(), 'fetch')
    return fetch(url, **kwargs)
 
  def handle_html(self, url, html):
    """handle html page"""
    print(url)
 
  def handle_response(self, url, response):
    """inherit and rewrite this method"""
    if response.code == 200:
      self.handle_html(url, response.body)
 
    elif response.code == 599:  # 599: connection/timeout failure, retry
      self._fetching.remove(url)
      self._q.put(url)
 
  @gen.coroutine
  def get_page(self, url):
    try:
      response = yield self.fetch(url)
      print('######fetched %s' % url)
    except Exception as e:
      print('Exception: %s %s' % (e, url))
      raise gen.Return(e)
    raise gen.Return(response)
 
  @gen.coroutine
  def _run(self):
    @gen.coroutine
    def fetch_url():
      current_url = yield self._q.get()
      try:
        if current_url in self._fetching:
          return
 
        print('fetching****** %s' % current_url)
        self._fetching.add(current_url)
 
        response = yield self.get_page(current_url)
        self.handle_response(current_url, response)  # handle response
 
        self._fetched.add(current_url)
 
        for i in range(self.concurrency):
          if self.urls:
            yield self._q.put(self.urls.pop())
 
      finally:
        self._q.task_done()
 
    @gen.coroutine
    def worker():
      while True:
        yield fetch_url()
 
    self._q.put(self.urls.pop())  # add first url
 
    # Start workers, then wait for the work queue to be empty.
    for _ in range(self.concurrency):
      worker()
 
    yield self._q.join(timeout=timedelta(seconds=300000))
    assert self._fetching == self._fetched
 
  def run(self):
    io_loop = ioloop.IOLoop.current()
    io_loop.run_sync(self._run)
 
 
class MySpider(AsySpider):
 
  def fetch(self, url, **kwargs):
    """重寫父類fetch方法可以添加cookies,headers,timeout等信息"""
    cookies_str = "PHPSESSID=j1tt66a829idnms56ppb70jri4; pspt=%7B%22id%22%3A%2233153%22%2C%22pswd%22%3A%228835d2c1351d221b4ab016fbf9e8253f%22%2C%22_code%22%3A%22f779dcd011f4e2581c716d1e1b945861%22%7D; key=%E9%87%8D%E5%BA%86%E5%95%84%E6%9C%A8%E9%B8%9F%E7%BD%91%E7%BB%9C%E7%A7%91%E6%8A%80%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8; think_language=zh-cn; SERVERID=a66d7d08fa1c8b2e37dbdc6ffff82d9e|1444973193|1444967835; CNZZDATA1254842228=1433864393-1442810831-%7C1444972138"  # 從瀏覽器拷貝cookie字符串
    headers = {
      'User-Agent': 'mozilla/5.0 (compatible; baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
      'cookie': cookies_str
    }
    return super(MySpider, self).fetch(  # see the Tornado docs for the available parameters
      url, headers=headers, request_timeout=1
    )
 
  def handle_html(self, url, html):
    print(url, html)
 
 
def main():
  urls = []
  for page in range(1, 100):
    urls.append('http://www.baidu.com?page=%s' % page)
  s = MySpider(urls)
  s.run()
 
 
if __name__ == '__main__':
  main()

You can inherit from this class, feed some URLs into it, and then override handle_html to process the pages it fetches.
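For example, a minimal subclass could look like the sketch below (my own illustration, not from the original article; the regex-based title extraction and the URLs are just placeholders):

import re


class TitleSpider(AsySpider):
  """Print the <title> of every fetched page."""

  def handle_html(self, url, html):
    # html is response.body (bytes), so decode it before matching
    text = html.decode('utf-8', 'ignore')
    match = re.search(r'<title>(.*?)</title>', text, re.S | re.I)
    print(url, match.group(1).strip() if match else '(no title)')


if __name__ == '__main__':
  TitleSpider(['http://www.baidu.com?page=%s' % i for i in range(1, 20)]).run()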

Async + Multi-process Spider
To push things a step further, add a process pool on top using the multiprocessing module. The throughput really flies.

#!/usr/bin/env python
# -*- coding:utf-8 -*-
 
import time
from multiprocessing import Pool
from datetime import timedelta
from tornado import httpclient, gen, ioloop, queues
 
 
class AsySpider(object):
  """A simple class of asynchronous spider."""
  def __init__(self, urls, concurrency):
    urls.reverse()
    self.urls = urls
    self.concurrency = concurrency
    self._q = queues.Queue()
    self._fetching = set()
    self._fetched = set()
 
  def handle_page(self, url, html):
    filename = url.rsplit('/', 1)[1]
    with open(filename, 'w+') as f:
      f.write(html)
 
  @gen.coroutine
  def get_page(self, url):
    try:
      response = yield httpclient.AsyncHTTPClient().fetch(url)
      print('######fetched %s' % url)
    except Exception as e:
      print('Exception: %s %s' % (e, url))
      raise gen.Return('')
    raise gen.Return(response.body)
 
  @gen.coroutine
  def _run(self):
 
    @gen.coroutine
    def fetch_url():
      current_url = yield self._q.get()
      try:
        if current_url in self._fetching:
          return
 
        print('fetching****** %s' % current_url)
        self._fetching.add(current_url)
        html = yield self.get_page(current_url)
        self._fetched.add(current_url)
 
        self.handle_page(current_url, html)
 
        for i in range(self.concurrency):
          if self.urls:
            yield self._q.put(self.urls.pop())
 
      finally:
        self._q.task_done()
 
    @gen.coroutine
    def worker():
      while True:
        yield fetch_url()
 
    self._q.put(self.urls.pop())
 
    # Start workers, then wait for the work queue to be empty.
    for _ in range(self.concurrency):
      worker()
    yield self._q.join(timeout=timedelta(seconds=300000))
    assert self._fetching == self._fetched
 
  def run(self):
    io_loop = ioloop.IOLoop.current()
    io_loop.run_sync(self._run)
 
 
def run_spider(beg, end):
  urls = []
  for page in range(beg, end):
    urls.append('http://127.0.0.1/%s.htm' % page)
  s = AsySpider(urls, 10)
  s.run()
 
 
def main():
  _st = time.time()
  p = Pool()
  all_num = 73000
  num = 4  # number of cpu cores
  per_num, left = divmod(all_num, num)
  s = range(0, all_num, per_num)
  res = []
  for i in range(len(s)-1):
    res.append((s[i], s[i+1]))
  res.append((s[len(s)-1], all_num))
  print(res)
 
  for i in res:
    p.apply_async(run_spider, args=(i[0], i[1],))
  p.close()
  p.join()
 
  print(time.time() - _st)
 
 
if __name__ == '__main__':
  main()
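As a concrete example of the work split in main(): with all_num = 73000 and num = 4, divmod(73000, 4) gives per_num = 18250, so s = [0, 18250, 36500, 54750] and res becomes [(0, 18250), (18250, 36500), (36500, 54750), (54750, 73000)]. Each tuple is one URL range handed to its own process, and every process runs an independent Tornado IOLoop over its share of the pages.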

Multi-threaded Spider
A thread-pool based implementation.

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import Queue
import sys
import requests
import os
import threading
import time
 
class Worker(threading.Thread):  # processes work requests
  def __init__(self, workQueue, resultQueue, **kwds):
    threading.Thread.__init__(self, **kwds)
    self.setDaemon(True)
    self.workQueue = workQueue
    self.resultQueue = resultQueue
 
 
  def run(self):
    while 1:
      try:
        callable, args, kwds = self.workQueue.get(False)  # non-blocking get; raises Queue.Empty when the queue is drained
        res = callable(*args, **kwds)
        self.resultQueue.put(res)  # put result
      except Queue.Empty:
        break
 
class WorkManager:  # creates and manages the thread pool
  def __init__(self, num_of_workers=10):
    self.workQueue = Queue.Queue()  # queue of pending requests
    self.resultQueue = Queue.Queue()  # queue of results
    self.workers = []
    self._recruitThreads(num_of_workers)
 
  def _recruitThreads(self, num_of_workers):
    for i in range(num_of_workers):
      worker = Worker(self.workQueue, self.resultQueue)  # create a worker thread
      self.workers.append(worker)  # add it to the pool
 
 
  def start(self):
    for w in self.workers:
      w.start()
 
  def wait_for_complete(self):
    while len(self.workers):
      worker = self.workers.pop()  # take one worker out of the pool
      worker.join()
      if worker.isAlive() and not self.workQueue.empty():
        self.workers.append(worker)  # still running: put it back into the pool
    print 'All jobs were complete.'
 
 
  def add_job(self, callable, *args, **kwds):
    self.workQueue.put((callable, args, kwds))  # enqueue a task
 
  def get_result(self, *args, **kwds):
    return self.resultQueue.get(*args, **kwds)
 
 
def download_file(url):
  #print 'beg download', url
  requests.get(url).text
 
 
def main():
  try:
    num_of_threads = int(sys.argv[1])
  except:
    num_of_threads = 10
  _st = time.time()
  wm = WorkManager(num_of_threads)
  print num_of_threads
  urls = ['http://www.baidu.com'] * 1000
  for i in urls:
    wm.add_job(download_file, i)
  wm.start()
  wm.wait_for_complete()
  print time.time() - _st
 
if __name__ == '__main__':
  main()
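Note that this thread-pool version is written for Python 2 (the Queue module, print statements, isAlive()); on Python 3 you would import queue instead, call print() as a function, and use is_alive(). Run it as, for example, python spider.py 20 to use 20 worker threads; without an argument it falls back to 10 (the script name here is just a placeholder).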

Any one of these three approaches is quite efficient, but running them like this puts noticeable pressure on the target web server, especially for small sites, so it is best to show some restraint.
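One simple way to be gentler, sketched below on top of the thread-pool version (my own addition, not part of the original article), is to sleep a random half second or so before each request, or simply to start fewer workers:

import random
import time

import requests


def polite_download(url):
  """Like download_file above, but waits 0.5-1.5 seconds before each request."""
  time.sleep(0.5 + random.random())
  return requests.get(url, timeout=10).text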
