前几天看到一个百度dota吧的欢乐直播贴,要说在baidu贴吧看直播贴实在是十分蛋疼的一件事情,连个只看楼主的功能都没有(比dzh神马的差远了),于是只能自己动手了。
#coding: utf-8
"""
Baidu Tieba live-thread extraction script.

Walks a thread page by page, keeps only the posts written by one user
(the thread starter by default) and writes them to a standalone HTML file.

Targets Python 2 (``urllib.urlopen``, ``urlparse``, BeautifulSoup 3).

@author:piglei2007@gmail.com
@version:1.0
"""
import urllib
import urlparse
from contextlib import closing

from BeautifulSoup import BeautifulSoup

OUTPUT_FILENAME = "gua_hacker.html"
THREAD_URL = "http://tieba.baidu.com/f?kz=923968151"

HTML_HEADER = '''
<html>
<head>
<meta http-equiv="content-Type" content="text/html; charset=utf-8" />
<style type="text/css">
* {font-size: 14px;margin: 0;padding: 0}
.content_top { padding: 10px 4px; font-weight: bold; }
.content_main { padding: 10px 4px; border-bottom: 1px solid #e0e0e0; }
.floor {color: gray}
</style>
</head>
<body>
<div style="width: 950px;margin: auto">
'''

PAGE_TEMPLATE = '''
<div class="content_top">
第 %(page)s 页:<a href="%(url)s" target="_blank">%(url)s</a>
</div>
'''

POST_TEMPLATE = '''
<div class="content_main">
<span class="floor">%(floor)s</span>:%(content)s
</div>
'''

HTML_FOOTER = '''
</div>
</body>
</html>
'''


class BaiduZhibo(object):
    """
    Iterator that yields, page after page, the posts of one user in a thread.

    Each iteration fetches the current page and returns a dict::

        {
            "url": "...",      # address of the page that was fetched
            "page": 5,         # page number, starting at 1
            "content": [...],  # one {"floor": ..., "content": ...} dict per
                               # post written by the selected user
        }

    Parameters:
        url: address of the thread.
        obj_name: nickname of the user whose posts are kept; when empty, the
            author of the first post encountered (the thread starter) is used.
        limit: maximum number of pages to fetch; 0 means no limit.
        html: output format. True leaves the post markup untouched, False
            replaces line-break tags and non-breaking-space entities.
    """

    def __init__(self, url, obj_name=None, limit=0, html=False):
        self.starturl = url
        self.obj_name = obj_name
        self.limit = limit
        self.counter = 0
        self.html = html
        self.nowurl = url
        self.page = 1

    def next(self):
        """
        Fetch the current page and return its matching posts.

        Raises StopIteration once `limit` pages have been fetched or the
        previous page had no "next page" link.
        """
        limit_reached = self.limit and self.counter == self.limit
        if limit_reached or self.nowurl is None:
            print("finished.")
            raise StopIteration

        result = {"url": self.nowurl, "page": self.page, "content": []}

        # Close the HTTP response explicitly instead of relying on garbage
        # collection to release the socket.
        with closing(urllib.urlopen(self.nowurl)) as response:
            content = response.read().decode("gbk", "ignore")
        soup = BeautifulSoup(content)

        for post in soup.findAll("div", {"class": "post"}):
            username = self.get_username(post)
            # No target user given: lock onto the first author we see.
            if not self.obj_name:
                self.obj_name = username
            if username != self.obj_name:
                continue
            floor, cc = self.parse_post(post)
            if not self.html:
                cc = self.replace_htmltag(cc)
            result["content"].append({"floor": floor, "content": cc})

        self.nowurl = self.get_nexturl(soup)
        self.counter += 1
        self.page += 1
        return result

    def __iter__(self):
        return self

    def get_username(self, post):
        """
        Return the value of the `username` attribute of the post's first <td>.
        """
        return post.find("td").get("username")

    def parse_post(self, post):
        """
        Return a `(floor, content)` pair of rendered markup for one post.
        """
        _content = post.find('td', {"class": "content"})
        floor = _content.find('p', {"class": "floor"}).renderContents()
        cc = _content.find('cc').renderContents()
        return floor, cc

    def get_nexturl(self, soup):
        """
        Return the absolute URL of the next page, or None if there is none.
        """
        next_link = soup.find(text=u"下一页")
        if not next_link:
            return None
        href = next_link.parent.get("href")
        if not href:
            # urljoin() with an empty target returns the base URL, which
            # would restart the crawl from the first page; stop instead.
            return None
        return urlparse.urljoin(self.starturl, href).encode("utf-8")

    @staticmethod
    def replace_htmltag(content):
        """
        Replace `<br />` tags with newlines and `&nbsp;` entities with spaces.
        """
        content = content.replace("<br />", "\n")
        # NOTE(review): the published listing showed a space-for-space
        # replacement (a no-op); the entity was presumably decoded when the
        # code was pasted into a web page, so it is restored here.
        content = content.replace("&nbsp;", " ")
        return content


def main():
    """
    Crawl the sample thread and write the starter's posts to an HTML file.
    """
    bz = BaiduZhibo(THREAD_URL, html=True)
    # Open the output only when run as a script, and make sure it is closed
    # (and flushed) even if a page fetch fails half-way through.
    with open(OUTPUT_FILENAME, "w") as objfile:
        objfile.write(HTML_HEADER)
        for x in bz:
            if not x["content"]:
                continue
            print("%s %s" % (x["page"], x["url"]))
            objfile.write(PAGE_TEMPLATE % x)
            for each in x["content"]:
                objfile.write(POST_TEMPLATE % each)
        objfile.write(HTML_FOOTER)


if __name__ == '__main__':
    main()
这个便是脚本的效果页面了:http://piglei.com/static/uploaded/2010/11/gua_hacker.html
用到了一个解析html/xml的包,现学现写的,挺好用,地址在这:http://www.crummy.com/software/BeautifulSoup/
😊 如果你喜欢这篇文章,也欢迎了解我的书: 《Python 工匠:案例、技巧与工程实践》 。它专注于编程基础素养与 Python 高级技巧的结合,是一本广受好评、适合许多人的 Python 进阶书。