核心代碼
requests.get 下載html網頁
bs4.BeautifulSoup 分析html內容
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
from requests import get
from bs4 import BeautifulSoup as bs
from datetime import datetime as dt


def Today(style=1):
    """Return today's local date as a string.

    Args:
        style: 1 (default) gives the zero-padded form 'YYYY-MM-DD';
            any other value gives the form 'M月D日'.
    """
    date = dt.today()
    if style != 1:
        return f'{date.month}月{date.day}日'
    return f'{date.year}-{date.month:02}-{date.day:02}'


def SinaNews(style=1):
    """Download a news index page and return today's links.

    Args:
        style: 1 (default) fetches the 'world' page, 2 the 'china' page,
            any other value the military news site.

    Returns:
        A list of (link text, href) tuples, one per <a> tag whose HTML
        contains today's date in 'YYYY-MM-DD' form.
    """
    # '***' is the author's mask for the real host name; replace it before use.
    url1 = 'http://news.***.com.cn/'
    if style == 1:
        url1 += 'world'
    elif style == 2:
        url1 += 'china'
    else:
        url1 = 'https://mil.news.sina.com.cn/'
    # A timeout keeps a stalled connection from blocking the caller forever.
    text = get(url1, timeout=10)
    # The codec name was misspelled 'uft-8'; requests then silently fell back
    # to a default decode instead of using the requested encoding.
    text.encoding = 'utf-8'
    soup = bs(text.text, 'html.parser')
    # Compute the date once so every anchor is compared to the same string.
    today = Today()
    # href=True skips anchors without an href, which would raise KeyError.
    return [(t.text, t['href'])
            for t in soup.find_all('a', href=True)
            if today in str(t)]
爬取標題
1
2
3
4
5
6
7
8
9
|
# Print today's headlines from the first (international) page, numbered from 1.
for number, entry in enumerate(SinaNews(1), start=1):
    print(f'No{number}:', entry[0])

# Sample output (headlines masked by the author):
# No1: 外媒: * * * * *
# No2: 日媒: * * * * * *
# ......
# ......
內容已馬賽克!!!
首次做爬蟲,為了方便下手,找一個不用破解網頁的某新聞網站,下載網頁就能直接取得內容。以其中的國際、國內和軍事新聞三個網頁作內容源,requests.get下載網頁後,分析所得html文本,所有<a href=...>標記中帶日期的剛好就是所需要的。
爬取正文
然後再根據url下載正文網頁,分析可知 id='article' 的<div>層就是正文所在位置,.get_text()是取得文本的關鍵函數,然後適當做一些格式處理:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
def NewsDownload(url):
    """Download one article page and return its body text.

    The body is taken from the <div id="article"> element. Runs of three or
    more newlines are collapsed to a single blank line.

    Args:
        url: Address of the article page.

    Returns:
        The cleaned article text, or '' when the page has no
        <div id="article"> element.
    """
    # A timeout keeps a stalled connection from blocking the caller forever.
    html = get(url, timeout=10)
    # The codec name was misspelled 'uft-8'.
    html.encoding = 'utf-8'
    soup = bs(html.text, 'html.parser')
    article = soup.find('div', id='article')
    if article is None:
        # find() returns None when nothing matches; avoid AttributeError.
        return ''
    text = article.get_text().strip()
    # Stray "(diǎn)"-style pinyin residue was removed from these literals.
    # NOTE(review): confirm the exact wording and script (traditional vs
    # simplified) against the live page, otherwise the replace never matches.
    text = text.replace('點擊進入專題:', '相關專題:')
    # NOTE(review): the whitespace inside these two literals may have been
    # altered when the snippet was extracted - confirm against the original.
    text = text.replace(' ', '\n ')
    while '\n\n\n' in text:
        text = text.replace('\n\n\n', '\n\n')
    return text


# Example session (site name and output masked by the author):
# >>> url = 'https://******/w/2021-09-29/doc-iktzqtyt8811588.shtml'
# >>> NewsDownload(url)
# '原標題:******************************************************'
界面代碼
使用內置的圖形界面庫 tkinter 控件 Text、Listbox、Scrollbar、Button。設置基本屬性、放置位置、綁定命令,然後調試到程序完工!
源代碼 News.pyw:其中涉及的網站名稱已馬賽克!
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
from requests import get
from bs4 import BeautifulSoup as bs
from datetime import datetime as dt
from os import path
import tkinter as tk

# Seconds to wait for the server, so a stalled connection cannot freeze the GUI
# indefinitely.
REQUEST_TIMEOUT = 10


def Today(style=1):
    """Return today's local date as a string.

    Args:
        style: 1 (default) gives the zero-padded form 'YYYY-MM-DD';
            any other value gives the form 'M月D日'.
    """
    date = dt.today()
    if style != 1:
        return f'{date.month}月{date.day}日'
    return f'{date.year}-{date.month:02}-{date.day:02}'


def SinaNews(style=1):
    """Download a news index page and return today's links.

    Args:
        style: 1 (default) fetches the 'world' page, 2 the 'china' page,
            any other value the military news site.

    Returns:
        A list of (link text, href) tuples, one per <a> tag whose HTML
        contains today's date in 'YYYY-MM-DD' form.
    """
    # '****' is the author's mask for the real host name; replace it before use.
    url1 = 'http://news.****.com.cn/'
    if style == 1:
        url1 += 'world'
    elif style == 2:
        url1 += 'china'
    else:
        url1 = 'https://mil.****.com.cn/'
    text = get(url1, timeout=REQUEST_TIMEOUT)
    # The codec name was misspelled 'uft-8'.
    text.encoding = 'utf-8'
    soup = bs(text.text, 'html.parser')
    # Compute the date once so every anchor is compared to the same string.
    today = Today()
    # href=True skips anchors without an href, which would raise KeyError.
    return [(t.text, t['href'])
            for t in soup.find_all('a', href=True)
            if today in str(t)]


def NewsDownload(url):
    """Download one article page and return its cleaned body text.

    Returns '' when the page has no <div id="article"> element.
    """
    html = get(url, timeout=REQUEST_TIMEOUT)
    # The codec name was misspelled 'uft-8'.
    html.encoding = 'utf-8'
    soup = bs(html.text, 'html.parser')
    article = soup.find('div', id='article')
    if article is None:
        # find() returns None when nothing matches; avoid AttributeError.
        return ''
    text = article.get_text().strip()
    # Stray "(diǎn)"-style pinyin residue was removed from these literals.
    # NOTE(review): confirm the exact wording and script (traditional vs
    # simplified) against the live page, otherwise the replace never matches.
    text = text.replace('點擊進入專題:', '相關專題:')
    # NOTE(review): the whitespace inside these two literals may have been
    # altered when the snippet was extracted - confirm against the original.
    text = text.replace(' ', '\n ')
    while '\n\n\n' in text:
        text = text.replace('\n\n\n', '\n\n')
    return text


def _SetText(content):
    """Replace the whole content of the read-only text pane."""
    tText.config(state=tk.NORMAL)
    # Tk text indices are 'line.char' with lines counted from 1.
    tText.delete('1.0', tk.END)
    tText.insert(tk.END, content)
    tText.config(state=tk.DISABLED)


def NewsList(i):
    """Reload the headline list for category i and show its first article."""
    global news
    news = SinaNews(i)
    tList.delete(0, tk.END)
    for idx, item in enumerate(news):
        tList.insert(tk.END, f'{idx+1:03} {item[0]}')
    if news:
        # Keep the highlighted row in step with the article shown below.
        tList.select_set(0)
    _SetText('')
    NewsShow(0)


def NewsList1():
    """Button callback: international news."""
    NewsList(1)


def NewsList2():
    """Button callback: domestic news."""
    NewsList(2)


def NewsList3():
    """Button callback: military news."""
    NewsList(3)


def NewsShow(idx):
    """Show one article in the text pane.

    Args:
        idx: 0 shows the first headline. Any other value - including the Tk
            event object passed by the double-click binding - shows the
            headline currently selected in the list box.
    """
    if idx != 0:
        selection = tList.curselection()
        if not selection:
            # Nothing selected (e.g. double-click on an empty list).
            return
        idx = selection[0]
    if not news:
        # No headline carries today's date; nothing to download.
        return
    title, url = news[idx][0], news[idx][1]
    _SetText(title + '\n\n' + NewsDownload(url))


def InitWindow(self, W, H):
    """Size, place and configure the main window, then run the event loop.

    Args:
        self: The Tk root window.
        W: Window width in pixels.
        H: Window height in pixels.
    """
    Y = self.winfo_screenheight()
    # Place the window 8 px from the left edge, 100 px above the screen bottom.
    self.geometry(f'{W}x{H}+8+{Y - H - 100}')
    icoFile = 'favicon.ico'
    if path.exists(icoFile):
        # Use the window passed in rather than the module-level global.
        self.iconbitmap(icoFile)
    self.resizable(False, False)
    self.wm_attributes('-topmost', True)
    self.title(bTitle[0])
    SetControl()
    self.update()
    self.mainloop()


def SetControl():
    """Create and place all widgets and load the first article."""
    global tList, tText
    # Headline list with its scrollbar.
    tScroll = tk.Scrollbar(win, orient=tk.VERTICAL)
    tScroll.place(x=450, y=320, height=300)
    tList = tk.Listbox(win, selectmode=tk.BROWSE, yscrollcommand=tScroll.set)
    tScroll.config(command=tList.yview)
    for idx, item in enumerate(news):
        tList.insert(tk.END, f'{idx+1:03} {item[0]}')
    tList.place(x=15, y=320, width=435, height=300)
    tList.select_set(0)
    tList.focus()
    # Three category buttons laid out 100 px apart.
    bW, bH = 70, 35    # button width and height
    bX, bY = 95, 270   # position of the first button
    tBtn1 = tk.Button(win, text=bTitle[1], command=NewsList1)
    tBtn1.place(x=bX, y=bY, width=bW, height=bH)
    tBtn2 = tk.Button(win, text=bTitle[2], command=NewsList2)
    tBtn2.place(x=bX + 100, y=bY, width=bW, height=bH)
    tBtn3 = tk.Button(win, text=bTitle[3], command=NewsList3)
    tBtn3.place(x=bX + 200, y=bY, width=bW, height=bH)
    # Read-only article pane with its scrollbar.
    tScroll2 = tk.Scrollbar(win, orient=tk.VERTICAL)
    tScroll2.place(x=450, y=10, height=240)
    tText = tk.Text(win, yscrollcommand=tScroll2.set)
    tScroll2.config(command=tText.yview)
    tText.place(x=15, y=10, width=435, height=240)
    tText.config(state=tk.DISABLED, bg='azure', font=('宋體', '14'))
    NewsShow(0)
    tList.bind("<Double-Button-1>", NewsShow)


if __name__ == '__main__':
    win = tk.Tk()
    # Window title followed by the three button captions; stray pinyin
    # residue ("(guó)", "(nèi)") was removed from the captions.
    bTitle = ('今日新聞', '國際新聞', '國內新聞', '軍事新聞')
    news = SinaNews()
    InitWindow(win, 480, 640)
奉上全部代碼,在此就不作詳細分析了,如有需要請留言討論。在我的使用環境 Win7+Python3.8.8 下可以無錯運行!文中涉及的網站名稱已打上馬賽克,猜不出名字的可以私下裡問我。
軟件編譯
使用pyinstaller.exe編譯成單個運行文件,注意源碼文件的後綴名應該用.pyw,否則會有cmd黑窗口出現。還有一個小知識點,任意網站的Logo圖標icon文件,一般都能在根目錄裡下載到,即:
http(s)://websiteurl.com(.cn)/favicon.ico
編譯命令如下:
D:\>pyinstaller --onefile --nowindowed --icon="D:\favicon.ico" News.pyw
編譯完成後,在dist文件夾下生成一個News.exe可執行文件,大小約15M,還能接受。
反正拿走就能直接用
以上就是Python小程序爬取今日新聞拿走就能用的詳細內容,更多關於Python小程序的資料請關注服務器之家其它相關文章!
原文鏈接:https://blog.csdn.net/boysoft2002/article/details/120549021