1. Clean unnecessary tags and empty elements with XPath
import re

from lxml import etree
from loguru import logger


def xpath_clean(self, text: str, xpath_dict: dict) -> str:
    '''
    Remove unwanted elements via XPath.
    :param text: html_content
    :param xpath_dict: XPath expressions of the targets to remove
    :return: html_content as a string
    '''
    remove_by_xpath = xpath_dict if xpath_dict else dict()

    # Targets that should almost always be removed, except in extreme cases
    remove_by_xpath.update({
        '_remove_2': '//iframe',
        '_remove_4': '//button',
        '_remove_5': '//form',
        '_remove_6': '//input',
        '_remove_7': '//select',
        '_remove_8': '//option',
        '_remove_9': '//textarea',
        '_remove_10': '//figure',
        '_remove_11': '//figcaption',
        '_remove_12': '//frame',
        '_remove_13': '//video',
        '_remove_14': '//script',
        '_remove_15': '//style'
    })

    parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
    selector = etree.HTML(text, parser=parser)

    # Regular removal pass: drop every element matched by an XPath expression
    for xpath in remove_by_xpath.values():
        for bad in selector.xpath(xpath):
            bad_string = etree.tostring(bad, encoding='utf-8', pretty_print=True).decode()
            logger.debug(f"clean article content : {bad_string}")
            bad.getparent().remove(bad)

    skip_tip = "name()='img' or name()='tr' or " \
               "name()='th' or name()='tbody' or " \
               "name()='thead' or name()='table'"
    # Check every remaining element for text content; delete the empty ones
    for p in selector.xpath(f"//*[not({skip_tip})]"):
        # Skip elements that still contain an image or table structure
        if p.xpath(f".//*[{skip_tip}]") or \
                bool(re.sub(r'\s', '', p.xpath('string(.)'))):
            continue
        bad_p = etree.tostring(p, encoding='utf-8', pretty_print=True).decode()
        logger.debug(f"clean p tag : {bad_p}")
        p.getparent().remove(p)

    return etree.tostring(selector, encoding='utf-8', pretty_print=True).decode()
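To check this step in isolation, here is a minimal sketch. The sample HTML and the extra '_remove_ad' expression are invented for the demonstration, and since the function body never touches self, passing None is enough for a standalone call:

sample = '<div><script>var a = 1;</script><p>keep me</p><p>   </p></div>'
cleaned = xpath_clean(None, sample, {'_remove_ad': '//div[@class="ad"]'})
print(cleaned)
# the <script> tag and the whitespace-only <p> are gone; 'keep me' survives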
2. Clean tag attributes with PyQuery and return the processed source plus the plain text
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from pyquery import PyQuery as pq
from loguru import logger


def pyquery_clean(self, text, url, pq_dict) -> object:
    '''
    Apply the remaining cleanup with pyquery.
    :param text: html_content
    :param url: page url, used to complete relative image links
    :param pq_dict: pyquery expressions of the targets to remove
    :return: (content, html_content)
    '''
    # Dict of pyquery expressions to remove
    remove_by_pq = pq_dict if pq_dict else dict()
    # Attribute whitelist
    attr_white_list = ['rowspan', 'colspan']
    # Attribute keys that may carry an image link
    img_key_list = ['src', 'data-echo', 'data-src', 'data-original']
    # Build the pyquery object
    dom = pq(text)

    # Remove useless tags
    for bad_tag in remove_by_pq.values():
        for bad in dom(bad_tag):
            bad_string = pq(bad).html()
            logger.debug(f"clean article content : {bad_string}")
        dom.remove(bad_tag)

    # Process the attributes of every tag
    for tag in dom('*'):
        # snapshot the items: attributes are removed while looping
        for key, value in list(tag.attrib.items()):
            # Skip logic: keep the table attributes rowspan and colspan
            if key in attr_white_list:
                continue
            # Image links: complete a partial url, then swap the attribute to src
            if key in img_key_list:
                img_url = self.absolute_url(url, value)
                pq(tag).remove_attr(key)
                pq(tag).attr('src', img_url)
                pq(tag).attr('alt', '')
            # Keep the alt attribute of img tags, but blank it
            elif key == 'alt':
                pq(tag).attr(key, '')
            # Remove every other attribute
            else:
                pq(tag).remove_attr(key)

    return dom.text(), dom.html()
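A standalone sketch of this step, with an invented _Shim class standing in for self (only absolute_url is needed) and made-up sample HTML:

from urllib.parse import urljoin, urlsplit

class _Shim:
    # hypothetical stand-in: pyquery_clean only calls self.absolute_url
    @staticmethod
    def absolute_url(baseurl, url):
        return url if urlsplit(url).scheme else urljoin(baseurl, url)

html = '<p style="color:red"><img data-src="/pic/1.jpg" alt="caption"></p>'
content, html_out = pyquery_clean(_Shim(), html, 'https://example.com', {})
print(html_out)
# the style attribute is stripped and the img now reads
# <img src="https://example.com/pic/1.jpg" alt=""/>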
3. Clean up whitespace and line breaks with regular expressions
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re


def regular_clean(self, str1: str, str2: str):
    '''
    Normalize the data format with regular expressions.
    :param str1: content
    :param str2: html_content
    :return: the processed results
    '''

    def new_line(text):
        text = re.sub(r'<br\s?/?>', '<br>', text)
        text = re.sub(r'</?a>|</?em>|</?html>|</?body>|'
                      r'</?head>|<[a-zA-Z]{1,10}\s?/>|'
                      r'</?strong>|</?blockquote>|</?b>|'
                      r'</?span>|</?i>|</?hr>|</?font>',
                      '', text)
        text = re.sub(r'\n', '', text)
        text = re.sub(r'<h[1-6]>', '<p>', text)
        text = re.sub(r'</h[1-6]>', '</p>', text)
        text = text.replace('</p>', '</p>\n').replace('<br>', '<br/>')
        return text

    str1, str2 = self.clean_blank(str1), self.clean_blank(str2)  # TODO handle blank lines
    # TODO html_content processing: 1. remove leftover tags that cannot be used
    # or that hurt the display; 2. normalize and replace line breaks
    str2 = new_line(text=str2)
    return str1, str2
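The effect of new_line is easiest to see on a small invented fragment; this sketch replays the same substitutions step by step:

import re

raw = '<h2>Section</h2><em>note</em><br />end'
step = re.sub(r'<br\s?/?>', '<br>', raw)   # normalize <br/> variants
step = re.sub(r'</?em>', '', step)         # strip inline formatting tags
step = re.sub(r'<h[1-6]>', '<p>', step)    # demote headings to paragraphs
step = re.sub(r'</h[1-6]>', '</p>', step)
print(step.replace('</p>', '</p>\n').replace('<br>', '<br/>'))
# -> <p>Section</p>
#    note<br/>end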
Finally, the full class wrapping all of the methods above
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
author: szhan
date: 2020-08-17
summary: clean html_content and extract the plain-text content
'''

import re
from lxml import etree
from pyquery import PyQuery as pq
from urllib.parse import urlsplit, urljoin
from loguru import logger


class CleanArticle:

    def __init__(
            self,
            text: str,
            url: str = '',
            xpath_dict: dict = None,
            pq_dict: dict = None
    ):
        self.text = text
        self.url = url
        self.xpath_dict = xpath_dict or dict()
        self.pq_dict = pq_dict or dict()

    @staticmethod
    def absolute_url(baseurl: str, url: str) -> str:
        '''
        Complete a partial url.
        :param baseurl: scheme url
        :param url: target url
        :return: complete url
        '''
        target_url = url if urlsplit(url).scheme else urljoin(baseurl, url)
        return target_url

    @staticmethod
    def clean_blank(text):
        '''
        Normalize whitespace.
        :param text:
        :return:
        '''
        # the first replacement assumes the literal '&nbsp;' entity text;
        # '\xa0' covers the decoded non-breaking space
        text = text.replace('&nbsp;', ' ').replace('\u3000', ' ').replace('\t', ' ').replace('\xa0', ' ')
        text = re.sub(r'\s{2,}', '', text)
        text = re.sub(r'\n{2,}', '\n', text)
        text = text.strip('\n').strip()
        return text

    def run(self):
        '''
        :return: content, html_content after processing
        '''
        if (not bool(self.text)) or (not isinstance(self.text, str)):
            raise ValueError('html_content has a bad type value')
        # Step 1: use xpath to drop blank text, comments, and tags such as
        # iframe, button, form, script, style, video, etc.
        text = self.xpath_clean(self.text, self.xpath_dict)
        # Step 2: use pyquery for the finer details
        str1, str2 = self.pyquery_clean(text, self.url, self.pq_dict)
        # Final pass with regular expressions
        content, html_content = self.regular_clean(str1, str2)
        return content, html_content

    def xpath_clean(self, text: str, xpath_dict: dict) -> str:
        '''
        Remove unwanted elements via XPath.
        :param text: html_content
        :param xpath_dict: XPath expressions of the targets to remove
        :return: html_content as a string
        '''
        remove_by_xpath = xpath_dict if xpath_dict else dict()

        # Targets that should almost always be removed, except in extreme cases
        remove_by_xpath.update({
            '_remove_2': '//iframe',
            '_remove_4': '//button',
            '_remove_5': '//form',
            '_remove_6': '//input',
            '_remove_7': '//select',
            '_remove_8': '//option',
            '_remove_9': '//textarea',
            '_remove_10': '//figure',
            '_remove_11': '//figcaption',
            '_remove_12': '//frame',
            '_remove_13': '//video',
            '_remove_14': '//script',
            '_remove_15': '//style'
        })

        parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
        selector = etree.HTML(text, parser=parser)

        # Regular removal pass: drop every element matched by an XPath expression
        for xpath in remove_by_xpath.values():
            for bad in selector.xpath(xpath):
                bad_string = etree.tostring(bad, encoding='utf-8', pretty_print=True).decode()
                logger.debug(f"clean article content : {bad_string}")
                bad.getparent().remove(bad)

        skip_tip = "name()='img' or name()='tr' or " \
                   "name()='th' or name()='tbody' or " \
                   "name()='thead' or name()='table'"
        # Check every remaining element for text content; delete the empty ones
        for p in selector.xpath(f"//*[not({skip_tip})]"):
            # Skip elements that still contain an image or table structure
            if p.xpath(f".//*[{skip_tip}]") or \
                    bool(re.sub(r'\s', '', p.xpath('string(.)'))):
                continue
            bad_p = etree.tostring(p, encoding='utf-8', pretty_print=True).decode()
            logger.debug(f"clean p tag : {bad_p}")
            p.getparent().remove(p)

        return etree.tostring(selector, encoding='utf-8', pretty_print=True).decode()

    def pyquery_clean(self, text, url, pq_dict) -> object:
        '''
        Apply the remaining cleanup with pyquery.
        :param text: html_content
        :param url: page url, used to complete relative image links
        :param pq_dict: pyquery expressions of the targets to remove
        :return: (content, html_content)
        '''
        # Dict of pyquery expressions to remove
        remove_by_pq = pq_dict if pq_dict else dict()
        # Attribute whitelist
        attr_white_list = ['rowspan', 'colspan']
        # Attribute keys that may carry an image link
        img_key_list = ['src', 'data-echo', 'data-src', 'data-original']
        # Build the pyquery object
        dom = pq(text)

        # Remove useless tags
        for bad_tag in remove_by_pq.values():
            for bad in dom(bad_tag):
                bad_string = pq(bad).html()
                logger.debug(f"clean article content : {bad_string}")
            dom.remove(bad_tag)

        # Process the attributes of every tag
        for tag in dom('*'):
            # snapshot the items: attributes are removed while looping
            for key, value in list(tag.attrib.items()):
                # Skip logic: keep the table attributes rowspan and colspan
                if key in attr_white_list:
                    continue
                # Image links: complete a partial url, then swap the attribute to src
                if key in img_key_list:
                    img_url = self.absolute_url(url, value)
                    pq(tag).remove_attr(key)
                    pq(tag).attr('src', img_url)
                    pq(tag).attr('alt', '')
                # Keep the alt attribute of img tags, but blank it
                elif key == 'alt':
                    pq(tag).attr(key, '')
                # Remove every other attribute
                else:
                    pq(tag).remove_attr(key)

        return dom.text(), dom.html()

    def regular_clean(self, str1: str, str2: str):
        '''
        Normalize the data format with regular expressions.
        :param str1: content
        :param str2: html_content
        :return: the processed results
        '''

        def new_line(text):
            text = re.sub(r'<br\s?/?>', '<br>', text)
            text = re.sub(r'</?a>|</?em>|</?html>|</?body>|'
                          r'</?head>|<[a-zA-Z]{1,10}\s?/>|'
                          r'</?strong>|</?blockquote>|</?b>|'
                          r'</?span>|</?i>|</?hr>|</?font>',
                          '', text)
            text = re.sub(r'\n', '', text)
            text = re.sub(r'<h[1-6]>', '<p>', text)
            text = re.sub(r'</h[1-6]>', '</p>', text)
            text = text.replace('</p>', '</p>\n').replace('<br>', '<br/>')
            return text

        str1, str2 = self.clean_blank(str1), self.clean_blank(str2)  # TODO handle blank lines
        # TODO html_content processing: 1. remove leftover tags that cannot be used
        # or that hurt the display; 2. normalize and replace line breaks
        str2 = new_line(text=str2)
        return str1, str2


if __name__ == '__main__':
    with open('html_content.html', 'r', encoding='utf-8') as f:
        html = f.read()

    ca = CleanArticle(text=html)
    _, html_content = ca.run()
    print(html_content)
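Beyond the file-reading demo in __main__, the per-site hooks are the two dicts passed to the constructor. A hedged usage sketch follows; the selectors and the URL below are invented for illustration:

html = '<html><body><div class="ad">AD</div><p><a href="/next">read on</a></p></body></html>'
ca = CleanArticle(
    text=html,
    url='https://example.com/post/1',
    xpath_dict={'remove_ad': '//div[@class="ad"]'},   # site-specific junk, by XPath
    pq_dict={'remove_recommend': '.recommend'},       # site-specific junk, by CSS selector
)
content, html_content = ca.run()
print(content)       # plain text only: 'read on'
print(html_content)  # cleaned html; the <a> wrapper is stripped by regular_clean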
Summary
This concludes the article on the HTML-content cleanup tool built from XPath selectors, PyQuery, and regular expressions. For more on PyQuery and regex-based cleanup tools, see the earlier articles on 服務(wù)器之家 or the related articles below.
Original article: https://blog.csdn.net/weixin_37128372/article/details/108340853