python 解析 HTML 页面
摘录了《Dive Into Python》的例子,有两种方法:HTMLParser 和 SGMLParser。
第一种:
# -*- coding: utf-8 -*-
"""First approach: parse HTML with html.parser.

Extracts the href attribute of every <a> tag.  The original snippet was a
whitespace-mangled Python 2 paste with several syntax errors (``def _init``
instead of ``__init__``; ``ifname=='href'`` and ``printvalue`` run
together); this is the repaired version, ported to Python 3 where the
module is ``html.parser`` instead of ``HTMLParser``.
"""
from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    """Collect (and print) the href of every <a> tag fed to the parser."""

    def __init__(self):
        HTMLParser.__init__(self)
        # hrefs found so far, in document order (also printed as found)
        self.links = []

    # An empty tag like <tag .../> is handled as a start tag immediately
    # followed by the matching end tag.
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    def handle_starttag(self, tag, attrs):
        # Record and print the href attribute value of each <a> tag.
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    self.links.append(value)
                    print(value)

    def handle_endtag(self, tag):
        # End tag such as </xx> -- nothing to do.
        pass

    def handle_charref(self, name):
        # Numeric character reference (&#...;) -- ignored.
        pass

    def handle_entityref(self, name):
        # Named entity reference such as &nbsp; -- ignored.
        pass

    def handle_data(self, data):
        # Text between tags (<xx>data</xx>) -- ignored.
        pass

    def handle_comment(self, data):
        # HTML comment -- ignored.
        pass

    def handle_decl(self, decl):
        # Declaration such as
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" ...
        pass

    def handle_pi(self, data):
        # Processing instruction like <?instruction> -- ignored.
        pass


if __name__ == "__main__":
    a = '<body><a href="www.163.com">test</a></body>'
    print(a)
    my = MyHTMLParser()
    my.feed(a)  # prints: www.163.com
第二种方式:
首先是一个基础类,和上面的方式一样
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Second approach: a pass-through HTML processor built on sgmllib.

BaseHTMLProcessor reconstructs whatever document it is fed: every
SGMLParser callback appends the corresponding piece of markup to
``self.pieces`` and ``output()`` joins the pieces back into one string.
Subclasses override individual callbacks to transform the document.

NOTE(review): this is Python 2 code -- ``sgmllib`` and ``htmlentitydefs``
were removed in Python 3.
"""
from sgmllib import SGMLParser
import htmlentitydefs


class BaseHTMLProcessor(SGMLParser):

    def reset(self):
        # extend (called by SGMLParser.__init__)
        self.pieces = []
        SGMLParser.reset(self)

    # A start tag opens a block (<html>, <head>, <body>, <pre>, ...) or
    # stands alone (<br>, <img>, ...).  For tag "tagname" SGMLParser first
    # looks for a start_tagname or do_tagname method (e.g. start_pre /
    # do_pre for <pre>) and calls it with the attribute list; only when
    # neither exists does it fall back to unknown_starttag.
    def unknown_starttag(self, tag, attrs):
        # called for each start tag; attrs is a list of (attr, value)
        # tuples, e.g. for <pre class="screen">, tag="pre",
        # attrs=[("class", "screen")]
        # Ideally we would reconstruct the original tag and attributes, but
        # we may end up quoting attribute values that weren't quoted in the
        # source document, or change the quote style (single to double).
        # Note that improperly embedded non-HTML code (like client-side
        # Javascript) may be parsed incorrectly by the ancestor, causing
        # runtime script errors.  All non-HTML code must be enclosed in
        # HTML comment tags (<!-- code -->) to ensure that it passes
        # through this parser unaltered (in handle_comment).
        strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
        self.pieces.append("<%(tag)s%(strattrs)s>" % locals())

    # An end tag closes a block (</html>, </head>, </body>, </pre>, ...).
    # SGMLParser looks for an end_tagname method first and falls back to
    # unknown_endtag.
    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be "pre";
        # reconstruct the original end tag
        self.pieces.append("</%(tag)s>" % locals())

    # Character reference written in decimal or equivalent hexadecimal.
    def handle_charref(self, ref):
        # e.g. for "&#160;", ref will be "160"; reconstruct it verbatim
        self.pieces.append("&#%(ref)s;" % locals())

    # HTML entity such as &copy;; SGMLParser passes the bare entity name.
    def handle_entityref(self, ref):
        # e.g. for "&copy;", ref will be "copy"
        self.pieces.append("&%(ref)s" % locals())
        # standard HTML entities are closed with a semicolon; other
        # entities are not (``in`` replaces the deprecated dict.has_key)
        if ref in htmlentitydefs.entitydefs:
            self.pieces.append(";")

    # Plain text: anything that matches none of the other categories.
    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references; stored verbatim
        self.pieces.append(text)

    # HTML comment, everything between <!-- and -->.
    def handle_comment(self, text):
        # e.g. <!-- insert Javascript code here -->
        # It is especially important that the source document enclose
        # client-side code (like Javascript) within comments so it can pass
        # through this processor undisturbed; see unknown_starttag.
        self.pieces.append("<!--%(text)s-->" % locals())

    # Processing instruction, between <? and >.
    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        self.pieces.append("<?%(text)s>" % locals())

    # Declaration such as the DOCTYPE, between <! and >.
    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #  "http://www.w3.org/TR/html4/loose.dtd">
        self.pieces.append("<!%(text)s>" % locals())

    def output(self):
        """Return processed HTML as a single string"""
        return "".join(self.pieces)
接着第二种方法具体的应用,解析的是新浪一个特定blog的文章的内容和标题代码如下:
#!/usr/bin/env python
# coding: utf8
"""Concrete use of the second approach: fetch a Sina blog article, extract
its title and body text, and save them to a file.

NOTE(review): Python 2 code -- relies on the sgmllib-based
BaseHTMLProcessor module and on ``urllib.urlopen``.
"""
import re
import urllib

from BaseHTMLProcessor import BaseHTMLProcessor


class Dialectizer(BaseHTMLProcessor):
    """Strip all markup, emitting 'title'/'div' markers used by test()."""

    # (from_pattern, to_pattern) pairs applied to every text block
    subs = ()

    def reset(self):
        # extend (called from __init__ in ancestor): reset data attributes
        self.verbatim = 0
        BaseHTMLProcessor.reset(self)

    def unknown_starttag(self, tag, attrs):
        # drop every tag that has no dedicated handler below
        self.pieces.append("")

    def unknown_endtag(self, tag):
        self.pieces.append("")

    def start_title(self, attrs):
        # 'title' markers delimit the document title for test() below
        self.pieces.append("title")

    def end_title(self):
        self.pieces.append("title")

    def start_p(self, attrs):
        # paragraphs become plain newlines
        self.pieces.append("\n")

    def end_p(self):
        self.pieces.append("")

    def start_div(self, attrs):
        # BUG FIX: the original read ``strattrs = "".join()`` -- str.join
        # with no argument raises TypeError.  Join the tag's attribute
        # list, mirroring BaseHTMLProcessor.unknown_starttag.
        strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
        self.pieces.append(strattrs)

    def end_div(self):
        # 'div' marker delimits the article body for test() below
        self.pieces.append("div")

    def handle_data(self, text):
        # pass text through untouched in verbatim mode, otherwise run subs
        self.pieces.append(self.verbatim and text or self.process(text))

    def process(self, text):
        for from_pattern, to_pattern in self.subs:
            text = re.sub(from_pattern, to_pattern, text)
        return text


def translate(url):
    """Download *url* and return it stripped of markup by Dialectizer."""
    sock = urllib.urlopen(url)
    try:
        html_source = sock.read()
    finally:
        sock.close()  # always release the connection
    parser = Dialectizer()
    # parser.subs = ((r"本", r"aaa"),)
    parser.feed(html_source)  # parse the page
    parser.close()
    return parser.output()


def test(url, filename):
    """Fetch *url*, cut out title and body, and write them to *filename*."""
    html_source = translate(url)
    # title: text up to the first 'title' marker (len("title") == 5)
    title = html_source
    title = title[:re.search("title", title).end() - 5]
    # content: text up to the first 'div' marker (len("div") == 3)
    content = html_source
    content = content[:re.search("div", content).end() - 3]
    content = re.sub("&nbsp;", "", content)
    content = re.sub("nbsp;", "", content)
    # output file content: title, blank lines, then the article body
    file_content = title + "\n\n\n" + content
    fsock = open(filename, "wb")
    try:
        fsock.write(file_content)
    finally:
        fsock.close()  # always flush/close, even if write fails


if __name__ == "__main__":
    test("http://blog.sina.com.cn/s/blog_4bd7b9a20100cpgb.html", 'test.txt')
页:
[1]