在學習 python 的過程中,經過不斷的嘗試及努力,終于完成了第一個像樣的 python 程序,雖然還有很多需要優化的地方,但是目前基本上實現了我所要求的功能,先貼一下程序代碼:
具體代碼如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
|
#! /usr/bin/python
# Python 2 scraper: downloads one page per day from jokeswarehouse.com,
# extracts a title and body with regular expressions, and stores the result
# as a published post in a WordPress ``wp_posts`` table.
import os
import re
import sys
import time
import urllib2

import MySQLdb

# Title regex. NOTE(review): the annotated listing later in this article has
# a space between "</font>" and "<font" -- confirm which one matches the site.
reTitle = re.compile(r'<font[^>]*>(.*?)<\/font><font[^>]*')
# Coarse content regex; its matches are refined by ``retiqu`` below.
reNeiron = re.compile('[1-9|A-Z|a-z].*')
# Keeps only candidate lines that do not start with MARGINWIDTH/BR and do not
# end in '>', '|' or '}'.
retiqu = re.compile('^(?!MARGINWIDTH|BR).*.[^>|}]$')
rezhong = re.compile('^[^[].*')  # not used anywhere in this script

shijian = 1190944000  # Unix timestamp of the first day to fetch
Str1 = "\\n---------------- BLOG OF YAO"  # not used anywhere in this script
bianhao = 2859  # the script assumes the next inserted row gets ID bianhao + 1

# Both statements use driver-side parameters (%s placeholders passed to
# cursor.execute) instead of string formatting, so quoting and escaping of
# the scraped text is handled by MySQLdb.
sql = (
    "INSERT INTO wp_posts (post_author,post_date,post_date_gmt,post_content,"
    "post_content_filtered,post_title,post_excerpt,post_status,post_type,"
    "comment_status,ping_status,post_password,post_name,to_ping,pinged,"
    "post_modified,post_modified_gmt,post_parent,menu_order,guid) "
    "VALUES ('1','2011-06-01 22:12:25','2011-05-09 04:12:25','','',"
    "'Auto Draft','','inherit','revision','open','open','','100-revision',"
    "'','','2011-06-01 22:12:25','2011-05-09 04:12:25',%s,'0','')"
)
sql2 = (
    "UPDATE wp_posts SET post_author = 1, "
    "post_date = '2011-06-01 22:12:25', "
    "post_date_gmt = '2011-06-01 22:12:25', "
    "post_content =%s, post_content_filtered = '', post_title = %s, "
    "post_excerpt = '', post_status = 'publish', post_type = 'post', "
    "comment_status = 'open', ping_status = 'open', post_password = '', "
    "post_name = %s, to_ping = '', pinged = '', "
    "post_modified = '2011-06-01 22:12:25', "
    "post_modified_gmt = '2011-05-09 04:12:30', "
    "post_parent = 0, menu_order = 0, guid = %s WHERE ID = %s"
)

for _ in range(1, 1500):  # 1499 iterations, one per day
    ltime = time.localtime(shijian)
    timeStr = time.strftime("%Y%m%d", ltime)
    shijian += 86400  # the next iteration fetches the following day
    url = "http://www.jokeswarehouse.com/cgi-bin/viewjoke2.cgi?id=%s" % timeStr
    print(url)
    a = urllib2.urlopen(url).read()

    Title = reTitle.findall(a)
    print("==========================================================================================================")
    for titles in Title:
        print(titles)
    if not Title:
        # No title on this page: skip it without consuming a post ID, rather
        # than failing on an undefined title or reusing the previous one.
        continue
    titles = Title[-1]  # as before, the last matched title is the one stored

    # Collect the refined content lines and join them with real newlines; the
    # driver escapes them when the query is sent.
    parts = []
    for candidate in reNeiron.findall(a):
        parts.extend(retiqu.findall(candidate))
    Str2 = "".join(part + "\n" for part in parts)

    bianhao += 1
    guid = "http://www.moncleronlineshops.com/?p=%s" % bianhao

    try:
        conn = MySQLdb.connect("XXXX.XXXX.XXXX.XXXX", "user", "passwd",
                               "dbname", charset="utf8",
                               init_command="set names utf8")
    except MySQLdb.OperationalError as message:
        print("like error")
        print(message)
        # Without a connection nothing below can work; stop here instead of
        # falling through to conn.cursor().
        sys.exit(1)

    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, (bianhao,))
            cursor.execute(sql2, (Str2, titles, titles, guid, bianhao))
        finally:
            cursor.close()
        # MySQLdb does not autocommit; without this the rows are discarded on
        # transactional (InnoDB) tables when the connection closes.
        conn.commit()
    finally:
        conn.close()

sys.exit()
下面,我們來給代碼加些注釋,讓讀者能看得更明白一些,如下:
具體代碼如下
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
|
#! /usr/bin/python
# Python 2 scraper: downloads one page per day from jokeswarehouse.com,
# extracts a title and body with regular expressions, and stores the result
# as a published post in a WordPress ``wp_posts`` table.

# Load the modules this program needs.
import os
import re
import sys
import time
import urllib2

import MySQLdb

# Regex that extracts the article title.
# NOTE(review): the uncommented listing earlier in this article has no space
# between "</font>" and "<font" -- confirm which one matches the site.
reTitle = re.compile(r'<font[^>]*>(.*?)<\/font> <font[^>]*')
# Regex that extracts the article content. This is only a coarse match; the
# result is refined by the next regex.
reNeiron = re.compile('[1-9|A-Z|a-z].*')
# Regex that refines what reNeiron extracted: keeps only candidate lines that
# do not start with MARGINWIDTH/BR and do not end in '>', '|' or '}'.
retiqu = re.compile('^(?!MARGINWIDTH|BR).*.[^>|}]$')

# Unix timestamp of the first day to fetch.
shijian = 1190944000
# Unused: it was meant to be appended to each article but never was.
Str1 = "\\n---------------- BLOG OF YAO"
# WordPress post number: the last value of the ID column in wp_posts. The
# script assumes the next inserted row gets ID bianhao + 1.
bianhao = 2859

# The two statements below were exported from mysqlbinlog; they were tested to
# insert into the WordPress database and display correctly on the site.
# They use driver-side parameters (%s placeholders passed to cursor.execute)
# instead of string formatting, so quoting and escaping of the scraped text is
# handled by MySQLdb.
sql = (
    "INSERT INTO wp_posts (post_author,post_date,post_date_gmt,post_content,"
    "post_content_filtered,post_title,post_excerpt,post_status,post_type,"
    "comment_status,ping_status,post_password,post_name,to_ping,pinged,"
    "post_modified,post_modified_gmt,post_parent,menu_order,guid) "
    "VALUES ('1','2011-06-01 22:12:25','2011-05-09 04:12:25','','',"
    "'Auto Draft','','inherit','revision','open','open','','100-revision',"
    "'','','2011-06-01 22:12:25','2011-05-09 04:12:25',%s,'0','')"
)
sql2 = (
    "UPDATE wp_posts SET post_author = 1, "
    "post_date = '2011-06-01 22:12:25', "
    "post_date_gmt = '2011-06-01 22:12:25', "
    "post_content =%s, post_content_filtered = '', post_title = %s, "
    "post_excerpt = '', post_status = 'publish', post_type = 'post', "
    "comment_status = 'open', ping_status = 'open', post_password = '', "
    "post_name = %s, to_ping = '', pinged = '', "
    "post_modified = '2011-06-01 22:12:25', "
    "post_modified_gmt = '2011-05-09 04:12:30', "
    "post_parent = 0, menu_order = 0, guid = %s WHERE ID = %s"
)

# One article per iteration; range(1, 1500) yields 1499 iterations.
for _ in range(1, 1500):
    # Convert the timestamp into a date string such as 19700101.
    ltime = time.localtime(shijian)
    timeStr = time.strftime("%Y%m%d", ltime)
    # Advance one day per iteration.
    shijian += 86400
    # The page to scrape; the formatted date goes at the end of the URL.
    url = "http://www.jokeswarehouse.com/cgi-bin/viewjoke2.cgi?id=%s" % timeStr
    # Read the page source into ``a``.
    a = urllib2.urlopen(url).read()

    # Extract the title(s) with reTitle; findall returns a list.
    Title = reTitle.findall(a)
    print("==========================================================================================================")
    if not Title:
        # No title on this page: skip it without consuming a post ID, rather
        # than failing on an undefined title or reusing the previous one.
        continue
    # As before, the last matched title is the one stored.
    titles = Title[-1]

    # reNeiron gives the rough content candidates; retiqu refines each one.
    # The refined lines are joined (one per line) into Str2, which is the
    # whole article body. The driver escapes the text when the query is sent.
    parts = []
    for candidate in reNeiron.findall(a):
        parts.extend(retiqu.findall(candidate))
    Str2 = "".join(part + "\n" for part in parts)

    # Advance the post number by one for every stored article.
    bianhao += 1
    guid = "http://www.moncleronlineshops.com/?p=%s" % bianhao

    # Connect to the database with MySQLdb and check that it succeeded.
    try:
        conn = MySQLdb.connect("XXXX.XXXX.XXXX.XXXX", "user", "passwd",
                               "dbname", charset="utf8",
                               init_command="set names utf8")
    except MySQLdb.OperationalError as message:
        print("like error")
        print(message)
        # Without a connection nothing below can work; stop here instead of
        # falling through to conn.cursor().
        sys.exit(1)

    try:
        cursor = conn.cursor()
        try:
            # Execute the two statements.
            cursor.execute(sql, (bianhao,))
            cursor.execute(sql2, (Str2, titles, titles, guid, bianhao))
        finally:
            cursor.close()
        # MySQLdb does not autocommit; without this the rows are discarded on
        # transactional (InnoDB) tables when the connection closes.
        conn.commit()
    finally:
        # Close the database connection.
        conn.close()

sys.exit()
上面是程序的代碼,采集的是 www.jokeswarehouse.com 這個笑話網站。通過 python 的 re 模塊,也就是正則匹配模塊,運行相應的正則表達式,過濾出我們所需要的標題和文章內容,再運用 python 的 MySQLdb 模塊連接數據庫,利用相應的插入語句,將內容插入數據庫。