用Python写的抓取天气预报程序
最近用java写网站有点累了,发发一些写于一年前的python代码,一直没有用在实际系统中。不知道针对现在的天气预报网站是不是有效,不过对各位应该有很大的参考价值。使用BeautifulSoup做HTML分析。
抓取最近的5天数据,并保存到mysql数据库中。
如果出现处理失败,会向指定的邮件地址,发送报警。这是一个比较完善的天气预报抓取程序。
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""Fetch weather forecasts from weather.com.cn and store them in MySQL.

TODO: add a field that records whether an update succeeded; on success
remember the ID and run the UPDATE once the whole run has finished,
otherwise send an SMS. Make sure the alert is sent only once.
"""
import datetime
import os
import re
import smtplib
import time
import urllib2
from StringIO import StringIO
from email.mime.text import MIMEText

import MySQLdb
from BeautifulSoup import BeautifulSoup

USER_AGENT = ' Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1'
BASE_URL_BEGIN = 'http://www.weather.com.cn/html/weather/'
BASE_URL_END = '.shtml'

# NOTE(review): database and SMTP credentials are hard-coded below;
# consider loading them from configuration instead.
conn = MySQLdb.connect(host="localhost", user="fun", passwd="fun", db="fun",use_unicode=1, charset='utf8')

# Mail: recipients of the alert mail
mailto_list=["ealpha.shi@mobimtech.com"]
# SMTP server, user name, password and mailbox domain suffix
mail_host="imichat.com"
mail_user="imichat"
mail_pass="imichat"
mail_postfix="imichat.com"

# WIDs that failed; used to decide whether an alert mail is sent
faultwid = []
# Retry counter for failed requests
dotime = 0
def send_mail(to_list,sub,content):
    """Send a plain-text mail through the configured SMTP server.

    to_list: list of recipient addresses
    sub:     subject line
    content: message body

    Example: send_mail(["aaa@126.com"], "sub", "content")

    Returns True when the mail was handed to the server, False on any
    error (the error text is printed).
    """
    me = mail_user + "<" + mail_user + "@" + mail_postfix + ">"
    msg = MIMEText(content)
    msg['Subject'] = sub
    msg['From'] = me
    # Address lists in mail headers are comma-separated, not ';'-separated.
    msg['To'] = ", ".join(to_list)
    try:
        s = smtplib.SMTP()
        try:
            s.connect(mail_host)
            s.login(mail_user, mail_pass)
            s.sendmail(me, to_list, msg.as_string())
        finally:
            # Release the socket even when connect/login/sendmail fails.
            s.close()
        return True
    except Exception as e:
        print(str(e))
        return False
def getFiveDayWeather(wid,pageid,agent=USER_AGENT):
    """Download one forecast page and hand its forecast table to the parser.

    wid:    weather id, passed through to getWeatherList
    pageid: page identifier placed between BASE_URL_BEGIN and BASE_URL_END
    agent:  value sent as the User-Agent request header

    Returns the child nodes of the element that contains the 'dd_0' div.
    Raises ValueError when the page has no such div; network errors from
    urllib2 propagate to the caller.
    """
    url = BASE_URL_BEGIN + pageid + BASE_URL_END
    request = urllib2.Request(url)
    request.add_header('User-Agent', agent)
    opener = urllib2.build_opener()
    response = opener.open(request)
    try:
        allhtml = response.read()
    finally:
        # Always release the HTTP connection.
        response.close()
    soup = BeautifulSoup(allhtml, fromEncoding="utf-8")
    anchor = soup.find('div', id='dd_0')
    if anchor is None:
        raise ValueError("forecast container 'dd_0' not found in " + url)
    html = anchor.parent.contents
    getWeatherList(wid, html)
    return html
def getWeatherList(wid,html):
    """Extract the last-published time and process each daily forecast box.

    wid:  weather id, passed through to getOneDayWeather
    html: node list produced by getFiveDayWeather

    Every div with class "fut_weatherbox" is passed to getOneDayWeather
    with a 1-based day index. Raises ValueError when the <h2> heading
    that carries the publication time is missing.
    """
    # The node list is re-parsed from its string form, as the original did.
    soup1 = BeautifulSoup(str(html))
    heading = soup1.find('h2')
    if heading is None:
        raise ValueError("publication-time <h2> heading not found")
    update_time = ''
    for node in heading:
        # The last child of the heading wins.
        update_time = node
    boxes = soup1.findAll('div', { "class" : "fut_weatherbox" })
    for dayid, dayweather in enumerate(boxes, 1):
        getOneDayWeather(wid, dayid, update_time, dayweather)
def _pic_code(img):
    """Return the file name of an <img> src without directory or extension."""
    return img['src'].split('/')[-1].split('.')[-2]


def getOneDayWeather(wid,dayid,update_time,html):
    """Parse one day's forecast box and store it through insertDB.

    wid:         weather id
    dayid:       1-based index of the day within the forecast
    update_time: publication time taken from the page heading
    html:        the forecast box for this day (tag or markup string)

    Reads the first <h3> (day label), the <img> icons, and the first <h4>
    of classes temp00_dn, temp01_dn, temp02_dn and temp03_dn, which are
    stored as weather, max temperature, min temperature and wind.
    Raises IndexError when an expected element is missing.
    """
    soup = BeautifulSoup(str(html), fromEncoding="UTF-8")
    day = soup.findAll('h3')
    imgs = soup.findAll('img')
    t00 = soup.findAll('h4', { "class" : "temp00_dn" })
    t01 = soup.findAll('h4', { "class" : "temp01_dn" })
    t02 = soup.findAll('h4', { "class" : "temp02_dn" })
    t03 = soup.findAll('h4', { "class" : "temp03_dn" })

    day_value = day[0].renderContents()
    # No loop on purpose: the day icon is the first image and the night
    # icon the second. The night icon previously re-read the first image.
    # Fall back to the first image when the box has only one.
    d_pic_value = _pic_code(imgs[0])
    if len(imgs) > 1:
        n_pic_value = _pic_code(imgs[1])
    else:
        n_pic_value = d_pic_value
    weather_value = t00[0].renderContents()
    max_temp = t01[0].renderContents()
    min_temp = t02[0].renderContents()
    wind = t03[0].renderContents()
    insertDB(wid,dayid,update_time,day_value,d_pic_value,n_pic_value,weather_value,max_temp,min_temp,wind )
def insertDB(wid,dayid,update_time,day_value,d_pic_value,n_pic_value,weather_value,max_temp,min_temp,wind ):
    """Insert one day's forecast row into weatherdetail and commit it.

    TODO: change this so that all five days are committed together, with
    a rollback on error.

    On failure the transaction is rolled back and the exception is
    re-raised so that the caller's retry logic still sees it.
    """
    sql = ("INSERT INTO weatherdetail( wid, dayid, lastupdate, currdate, dpic, npic,"
           "weather, maxtemp, mintemp, wind) "
           "VALUES( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    param = (wid,dayid,update_time ,day_value,d_pic_value,n_pic_value,weather_value,max_temp,min_temp,wind)
    cursor_update = conn.cursor()
    try:
        cursor_update.execute(sql, param)
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        # The cursor was previously never closed.
        cursor_update.close()
def sendMonitor():
    """Send an alert mail listing the failed WIDs, if there are any.

    Does nothing when faultwid is empty. Otherwise mails str(faultwid) to
    mailto_list and prints whether the mail could be sent.
    """
    if not faultwid:
        return
    subject = "Error: Get Weather Error " + str(datetime.datetime.now())
    if send_mail(mailto_list, subject, str(faultwid)):
        print("监控邮件发送成功.")
    else:
        print("监控邮件发送失败.")
def doworking(dotime,wid,pageid):
    """Business entry point: fetch and store one location, with retries.

    dotime: number of retries already used (callers pass 0)
    wid:    weather id recorded in faultwid when every attempt fails
    pageid: page identifier passed to getFiveDayWeather

    After each failed attempt a message is printed and the function
    sleeps five seconds. It retries while dotime is below 3, then appends
    wid to faultwid and gives up.
    """
    while True:
        try:
            getFiveDayWeather(wid, pageid)
            return
        except Exception:
            print("has one error on %s %s , then do it again , waiting five secs." % (wid,pageid))
            time.sleep(5)
            if dotime >= 3:
                faultwid.append(wid)
                return
            dotime += 1
if __name__ == "__main__":
    # Script entry point: process every row of the weather table, then
    # send the alert mail and print the elapsed seconds.
    starttime = datetime.datetime.now()
    print("Start."+str(starttime))
    cursor = conn.cursor()
    cursor.execute("SELECT id,weather_com_cn_pageid FROM weather")
    result = cursor.fetchall()
    cursor.close()
    for record in result:
        # Reset dotime to 0: this is the first attempt for this record.
        # record[0] is the id column and record[1] the page id column; the
        # whole row tuple used to be passed for both arguments.
        doworking(0, str(record[0]), str(record[1]))
        #time.sleep(2)
    # NOTE(review): the original indentation was lost; this separator is
    # assumed to follow the loop rather than run once per record.
    print('\r')
    endtime = datetime.datetime.now()
    print("End."+str(endtime))
    print("-------------------------------------------------")
    sendMonitor()
    # timedelta exposes .seconds; the former ".seconde" raised AttributeError.
    print((endtime - starttime).seconds)
页:
[1]