日本黄色激情视频,久久久久久久久91,欧美一级免费看

在學習 python 的過程中，經過不斷的嘗試及努力，終於完成了第一個像樣的 python 程序，雖然還有很多需要優化的地方，但是目前基本上實現了我所要求的功能，先貼一下程序代碼：

用python寫的一個wordpress的采集程序

具體代碼如下:

				?

									#! /usr/bin/python
"""Scrape jokes from jokeswarehouse.com and publish them as WordPress posts.

One page is requested per calendar day, starting at START_TIMESTAMP.  The
title and body text are pulled out of the HTML with regular expressions and
written directly into the ``wp_posts`` table through MySQLdb.
"""

import os
import re
import sys
import time
import urllib2

import MySQLdb

# Captures the text of a <font> element that is directly followed by another
# <font> tag; the last such capture on a page is used as the post title.
RE_TITLE = re.compile(r'<font[^>]*>(.*?)<\/font><font[^>]*')
# Coarse pass: everything from the first digit/letter/'|' to the end of a line.
RE_CONTENT = re.compile(r'[1-9|A-Z|a-z].*')
# Fine pass: drop lines starting with MARGINWIDTH or BR, and lines whose last
# character is '>', '|' or '}' (markup / script residue).
RE_KEEP = re.compile(r'^(?!MARGINWIDTH|BR).*.[^>|}]$')

START_TIMESTAMP = 1190944000    # Unix time of the first page to fetch
SECONDS_PER_DAY = 86400
# ID counter for new posts; it is incremented before each insert.
# NOTE(review): the UPDATE below addresses the new row by this counter, so it
# presumably has to equal the highest existing wp_posts.ID -- confirm before
# running against a different database.
LAST_POST_ID = 2859
PAGE_COUNT = 1500               # number of daily pages to request

# Scraped text is passed as query parameters (%s placeholders) instead of being
# pasted into the SQL string, so the driver does all quoting.
INSERT_SQL = (
    "INSERT INTO wp_posts (post_author,post_date,post_date_gmt,post_content,"
    "post_content_filtered,post_title,post_excerpt,post_status,post_type,"
    "comment_status,ping_status,post_password,post_name,to_ping,pinged,"
    "post_modified,post_modified_gmt,post_parent,menu_order,guid) "
    "VALUES ('1','2011-06-01 22:12:25','2011-05-09 04:12:25','','',"
    "'Auto Draft','','inherit','revision','open','open','','100-revision',"
    "'','','2011-06-01 22:12:25','2011-05-09 04:12:25',%s,'0','')"
)
UPDATE_SQL = (
    "UPDATE wp_posts SET post_author = 1, post_date = '2011-06-01 22:12:25', "
    "post_date_gmt = '2011-06-01 22:12:25', post_content = %s, "
    "post_content_filtered = '', post_title = %s, post_excerpt = '', "
    "post_status = 'publish', post_type = 'post', comment_status = 'open', "
    "ping_status = 'open', post_password = '', post_name = %s, to_ping = '', "
    "pinged = '', post_modified = '2011-06-01 22:12:25', "
    "post_modified_gmt = '2011-05-09 04:12:30', post_parent = 0, "
    "menu_order = 0, guid = %s WHERE ID = %s"
)


def page_url(timestamp):
    """Return the joke-page URL for the local calendar day of *timestamp*."""
    day = time.strftime("%Y%m%d", time.localtime(timestamp))
    return "http://www.jokeswarehouse.com/cgi-bin/viewjoke2.cgi?id=%s" % day


def extract_content(html):
    """Return the post body: every kept line of *html*, newline-terminated."""
    kept = []
    for line in RE_CONTENT.findall(html):
        kept.extend(RE_KEEP.findall(line))
    return "".join(piece + "\n" for piece in kept)


def save_post(conn, post_id, title, content):
    """Insert a placeholder row and turn it into the published post *post_id*.

    NOTE(review): assumes the inserted row receives ID == post_id (i.e. the
    table's auto-generated ID continues from LAST_POST_ID) -- confirm.
    """
    guid = "http://www.moncleronlineshops.com/?p=%s" % post_id
    cursor = conn.cursor()
    try:
        cursor.execute(INSERT_SQL, (post_id,))
        cursor.execute(UPDATE_SQL, (content, title, title, guid, post_id))
    finally:
        cursor.close()
    # Harmless on non-transactional tables; required on transactional ones,
    # where closing the connection would otherwise discard the rows.
    conn.commit()


def main():
    """Fetch PAGE_COUNT daily pages and store each one that has a title."""
    try:
        conn = MySQLdb.connect("XXXX.XXXX.XXXX.XXXX", "user", "passwd", "dbname",
                               charset="utf8", init_command="set names utf8")
    except MySQLdb.OperationalError as error:
        # Without a connection nothing below can work, so stop here instead
        # of failing later with a NameError on ``conn``.
        sys.exit("like error: %s" % (error,))

    post_id = LAST_POST_ID
    timestamp = START_TIMESTAMP
    try:
        for _ in range(PAGE_COUNT):
            url = page_url(timestamp)
            timestamp += SECONDS_PER_DAY
            print(url)
            try:
                html = urllib2.urlopen(url).read()
            except urllib2.URLError as error:
                # One missing day should not abort the whole run.
                print("fetch failed, skipping: %s" % (error,))
                continue
            print("==========================================================================================================")
            titles = RE_TITLE.findall(html)
            if not titles:
                # No title means no usable post; the ID counter is left
                # untouched so it stays in step with the rows actually written.
                print("no title found, skipping")
                continue
            for title in titles:
                print(title)
            post_id += 1
            save_post(conn, post_id, titles[-1], extract_content(html))
    finally:
        conn.close()


if __name__ == "__main__":
    sys.exit(main())

下面，我們來給代碼加些注釋，讓讀者能看得更明白一些，如下：

具體代碼如下

				?

									#! /usr/bin/python
"""Annotated WordPress collector: scrape jokes and store them as posts.

A page of www.jokeswarehouse.com is requested for each calendar day starting
at START_TIMESTAMP.  The title and article text are filtered out with the
``re`` module and inserted into the ``wp_posts`` table with MySQLdb.
"""

# Load the modules this program needs.
import os
import re
import sys
import time
import urllib2

import MySQLdb

# Regular expression that captures the article title.
RE_TITLE = re.compile(r'<font[^>]*>(.*?)<\/font> <font[^>]*')
# Regular expression for the article content.  This is only a rough cut; the
# result is refined by RE_KEEP below.
RE_CONTENT = re.compile(r'[1-9|A-Z|a-z].*')
# Refines what RE_CONTENT extracted: rejects lines that start with
# MARGINWIDTH or BR and lines that end in '>', '|' or '}'.
RE_KEEP = re.compile(r'^(?!MARGINWIDTH|BR).*.[^>|}]$')

# Unix timestamp of the first day to collect.
START_TIMESTAMP = 1190944000
# Added to the timestamp after every page, i.e. one day.
SECONDS_PER_DAY = 86400
# WordPress post number: the last value in the ID column of wp_posts.
LAST_POST_ID = 2859
# Loop 1500 times, i.e. try to collect 1500 articles.
PAGE_COUNT = 1500

# The two statements below were exported from mysqlbinlog and tested: they
# insert into the WordPress database and the content shows up on the site.
# The variable parts are %s placeholders filled in by the driver, so scraped
# text is never pasted into the SQL string itself.
INSERT_SQL = (
    "INSERT INTO wp_posts (post_author,post_date,post_date_gmt,post_content,"
    "post_content_filtered,post_title,post_excerpt,post_status,post_type,"
    "comment_status,ping_status,post_password,post_name,to_ping,pinged,"
    "post_modified,post_modified_gmt,post_parent,menu_order,guid) "
    "VALUES ('1','2011-06-01 22:12:25','2011-05-09 04:12:25','','',"
    "'Auto Draft','','inherit','revision','open','open','','100-revision',"
    "'','','2011-06-01 22:12:25','2011-05-09 04:12:25',%s,'0','')"
)
UPDATE_SQL = (
    "UPDATE wp_posts SET post_author = 1, post_date = '2011-06-01 22:12:25', "
    "post_date_gmt = '2011-06-01 22:12:25', post_content = %s, "
    "post_content_filtered = '', post_title = %s, post_excerpt = '', "
    "post_status = 'publish', post_type = 'post', comment_status = 'open', "
    "ping_status = 'open', post_password = '', post_name = %s, to_ping = '', "
    "pinged = '', post_modified = '2011-06-01 22:12:25', "
    "post_modified_gmt = '2011-05-09 04:12:30', post_parent = 0, "
    "menu_order = 0, guid = %s WHERE ID = %s"
)


def page_url(timestamp):
    """Return the URL of the page to collect for *timestamp*.

    The timestamp is converted to a date in the form 19700101 and placed at
    the end of the URL.
    """
    day = time.strftime("%Y%m%d", time.localtime(timestamp))
    return "http://www.jokeswarehouse.com/cgi-bin/viewjoke2.cgi?id=%s" % day


def extract_content(html):
    """Return the complete article text found in *html*.

    RE_CONTENT gives a rough list of candidate lines; each one is passed
    through RE_KEEP and the survivors are joined, one per line.
    """
    kept = []
    for line in RE_CONTENT.findall(html):
        kept.extend(RE_KEEP.findall(line))
    return "".join(piece + "\n" for piece in kept)


def save_post(conn, post_id, title, content):
    """Run the two SQL statements that create and publish post *post_id*.

    NOTE(review): assumes the inserted row receives ID == post_id (i.e. the
    table's auto-generated ID continues from LAST_POST_ID) -- confirm.
    """
    guid = "http://www.moncleronlineshops.com/?p=%s" % post_id
    cursor = conn.cursor()
    try:
        cursor.execute(INSERT_SQL, (post_id,))
        cursor.execute(UPDATE_SQL, (content, title, title, guid, post_id))
    finally:
        cursor.close()
    # Harmless on non-transactional tables; required on transactional ones,
    # where closing the connection would otherwise discard the rows.
    conn.commit()


def main():
    """Collect PAGE_COUNT daily pages and store each one that has a title."""
    # Connect to the database with MySQLdb and check that it succeeded.
    try:
        conn = MySQLdb.connect("XXXX.XXXX.XXXX.XXXX", "user", "passwd", "dbname",
                               charset="utf8", init_command="set names utf8")
    except MySQLdb.OperationalError as error:
        # Nothing below can run without a connection, so stop here.
        sys.exit("like error: %s" % (error,))

    post_id = LAST_POST_ID
    timestamp = START_TIMESTAMP
    try:
        for _ in range(PAGE_COUNT):
            url = page_url(timestamp)
            # Every pass moves the timestamp forward by one day.
            timestamp += SECONDS_PER_DAY
            try:
                # Read the source of the web page.
                html = urllib2.urlopen(url).read()
            except urllib2.URLError as error:
                # One missing day should not abort the whole run.
                print("fetch failed, skipping: %s" % (error,))
                continue
            print("==========================================================================================================")
            # findall() returns a list (hence the surrounding [] when it is
            # printed); the last element is used as the title.
            titles = RE_TITLE.findall(html)
            if not titles:
                # No title means no usable post; the post number is left
                # untouched so it stays in step with the rows actually written.
                print("no title found, skipping")
                continue
            # Every stored post advances the post number by one.
            post_id += 1
            save_post(conn, post_id, titles[-1], extract_content(html))
    finally:
        # Close the database.
        conn.close()


if __name__ == "__main__":
    sys.exit(main())

上面是程序的代碼，采集的是 www.jokeswarehouse.com 這個笑話網站。通過 python 的 re 模塊（也就是正則匹配模塊）運行相應的正則表達式，過濾出我們所需要的標題和文章內容，再運用 python 的 MySQLdb 模塊連接數據庫，利用相應的插入語句把內容插入數據庫。