如何用Python爬取知乎话题? - 行业资讯 - 肥雀云

因为要做观点,观点的屋子类似于知乎的话题,所以得想办法把它给爬下来,搞了半天最终还是妥妥地搞定了。代码是用Python写的,不懂的麻烦自学哈!懂的直接看代码,绝对可用。
 <代码> #编码:utf - 8
　　”“”
　　@author: haoning
　　@create时间:2015.8.5
　　”“”
　　从__future__进口部门#精确除法
　　从队列进口队列
　　从__builtin__导入错误
　　进口json
　　进口操作系统
　　进口再保险
　　进口平台
　　进口uuid
　　进口urllib
　　进口urllib2
　　导入系统
　　导入的时间
　　进口MySQLdb mdb
　　从bs4进口BeautifulSoup
　　
　　重载(系统)
　　sys。setdefaultencoding (“utf - 8”)
　　
　　头={
　　“用户代理”:“Mozilla/5.0 (X11;Ubuntu;Linux x86_64;房车:Firefox 35.0)壁虎/20100101/35.0”,
　　“内容类型”:“应用程序/x-www-form-urlencoded;charset=utf - 8 ',
　　“X-Requested-With”:“XMLHttpRequest”,
　　“推荐人”:“https://www.zhihu.com/topics”,
　　“饼干”:“__utma=51854390.517069884.1416212035.1416212035.1416212035.1;q_c1=c02bf44d00d240798bfabcfc95baeb56 | 1455778173000 | 1455778173000;_za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a;aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C;_xsrf=9 d494558f9271340ab24598d85b2a3c8;cap_id=" MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=| 1455864276 | 2 a4ce8247ebd3c0df5393bb5661713ad9eec01dd”;n_c=1;_alicdn_sec=56 c6ba4d556557d27a0f8c876f563d12a285f33a '
　　}
　　
　　DB_HOST=' 127.0.0.1 '
　　DB_USER=案?
　　DB_PASS=案?
　　
　　队列=队列()#接收队列
　　组节点集=()
　　keywordSet=组()
　　停止=0
　　抵消=-20
　　水平=0
　　maxLevel=7
　　计数器=0
　　基?" "
　　
　　康涅狄格州=mdb。连接(DB_HOST DB_USER DB_PASS,“知乎”,charset=use utf8)
　　conn.autocommit(假)
　　咕咕叫=conn.cursor ()
　　
def get_html(url):
    """Fetch ``url`` and return the raw response body, or ``None`` on failure.

    The request is opened with a 3-second timeout. Any error raised while
    opening or reading the URL is swallowed and reported to the caller as
    ``None``, so a single bad page does not stop the crawl.
    """
    try:
        req = urllib2.Request(url)
        # A proxy should be plugged in here.
        response = urllib2.urlopen(req, None, 3)
        return response.read()
    except Exception:
        # Deliberately best-effort; unlike a bare ``except:`` this still lets
        # KeyboardInterrupt / SystemExit stop the program.
        return None
　　
def getTopics():
    """Download the Zhihu topics page and store its top-level categories.

    Every ``<li class="zm-topic-cat-item">`` element is looked up by name in
    the ``classify_new`` table and inserted when it is not present yet.
    Uses the module-level ``curr`` cursor and ``conn`` connection. Errors are
    printed and otherwise ignored.
    """
    url = 'https://www.zhihu.com/topics'
    print(url)
    try:
        req = urllib2.Request(url)
        # A proxy should be plugged in here.
        response = urllib2.urlopen(req)
        html = response.read().decode('utf-8')
        print(html)
        soup = BeautifulSoup(html)
        lis = soup.find_all('li', {'class': 'zm-topic-cat-item'})

        for li in lis:
            data_id = li.get('data-id')
            name = li.text
            # NOTE(review): the SQL text was reconstructed from a
            # machine-translated listing - confirm against the real schema.
            # Parameters are passed as a tuple; ``(name)`` is just a string.
            curr.execute('select id from classify_new where name=%s', (name,))
            y = curr.fetchone()
            if not y:
                curr.execute(
                    'INSERT INTO classify_new(data_id,name)VALUES(%s,%s)',
                    (data_id, name))
        conn.commit()
    except Exception as e:
        print("get topic error %s" % (e,))
　　
def get_extension(name):
    """Return the suffix of ``name`` starting at its last ``.``, or ``None``.

    The dot is part of the result (``"a.png"`` -> ``".png"``). ``None`` is
    returned when ``name`` contains no dot at all.
    """
    where = name.rfind('.')
    # ``rfind`` signals "not found" with -1.
    if where != -1:
        return name[where:]
    return None
　　
def which_platform():
    """Return the operating-system name reported by ``platform.system()``."""
    sys_str = platform.system()
    return sys_str
　　
def GetDateString():
    """Return the current local date formatted as ``YYYY-MM-DD``."""
    when = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    foldername = str(when)
    return foldername
　　
def makeDateFolder(par, classify):
    """Ensure the folder ``<par>/<date>/<classify>`` exists and return its path.

    ``par`` must already be an existing directory. The date component comes
    from ``GetDateString()``. On Linux the parts are joined with ``/``,
    otherwise with ``//``.

    Returns the folder path, or ``None`` when ``par`` is not a directory or
    the folder could not be created (the error is printed).
    """
    try:
        if os.path.isdir(par):
            newFolderName = par + '//' + GetDateString() + '//' + str(classify)
            if which_platform() == "Linux":
                newFolderName = par + '/' + GetDateString() + "/" + str(classify)
            if not os.path.isdir(newFolderName):
                os.makedirs(newFolderName)
            return newFolderName
        else:
            return None
    except Exception as e:
        # The original log label was lost in translation; use a descriptive one.
        print("makeDateFolder error: %s" % (e,))
    return None
　　
def download_img(url, classify):
    """Download the image at ``url`` into the dated folder for ``classify``.

    Returns:
        * the site-relative path of the stored file on success;
        * ``True`` when ``url`` contains the marker ``e82bab09c_m`` (nothing
          is written in that case);
        * ``None`` when ``url`` has no extension, the target folder is
          unavailable, the file already exists, or any error occurs. The
          caller is expected to fall back to the original site's link.
    """
    try:
        extention = get_extension(url)
        if extention is None:
            return None
        req = urllib2.Request(url)
        resp = urllib2.urlopen(req, None, 3)
        dataimg = resp.read()
        name = str(uuid.uuid1()).replace("-", "") + "_www.guandn.com" + extention
        top = "E://topic_pic"
        folder = makeDateFolder(top, classify)
        filename = None
        if folder is not None:
            filename = folder + "//" + name
        try:
            if "e82bab09c_m" in str(url):
                return True
            if filename is None:
                # No target folder: report failure explicitly instead of
                # relying on os.path.exists(None) raising.
                return None
            if not os.path.exists(filename):
                # ``with`` closes the file even if the write fails.
                with open(filename, 'w+b') as file_object:
                    file_object.write(dataimg)
                # NOTE(review): path prefix reconstructed from a
                # machine-translated listing - confirm.
                return '/room/default/' + GetDateString() + '/' + str(classify) + "/" + name
            else:
                print("file exist")
                return None
        except IOError as e1:
            print("e1= %s" % (e1,))
    except Exception as e:
        print("eee %s" % (e,))
    # Not downloaded: the caller should use the original site's link instead.
    return None
　　
def getChildren(node, name):
    """Queue the child topics of topic ``node`` that are not stored yet.

    Fetches the topic's "hot" page. When the page has a ``child-topic``
    section whose text mentions child topics, every ``zm-item-tag`` link is
    checked against the database by name, and unknown ones are pushed onto the
    module-level ``queue`` as ``(token, child_name, parent_name)``.

    ``name`` is accepted for interface compatibility; the visible body does
    not read it. Errors are printed and otherwise ignored.
    """
    global queue, nodeSet
    try:
        url = "https://www.zhihu.com/topic/" + str(node) + "/hot"
        html = get_html(url)
        if html is None:
            return
        soup = BeautifulSoup(html)
        p_ch = '父话题'
        node_name = soup.find('div', {'id': 'zh-topic-title'}).find('h2').text
        topic_cla = soup.find('div', {'class': 'child-topic'})
        if topic_cla is not None:
            try:
                p_ch = str(topic_cla.text)
                # All candidate child nodes.
                aList = soup.find_all('a', {'class': 'zm-item-tag'})
                if u'子话题' in p_ch:
                    for a in aList:
                        token = a.get('data-token')
                        # Keep the tag object and its flattened markup in
                        # separate names instead of rebinding the loop variable.
                        markup = str(a).replace('\n', '').replace('\t', '').replace('\r', '')
                        start = markup.find('>')
                        end = markup.rfind('</a>')
                        new_node = str(markup[start + 1:end])
                        # Make sure the name is not stored already.
                        # NOTE(review): table name reconstructed from a
                        # machine-translated listing - confirm.
                        curr.execute('select id from rooms where name=%s', (new_node,))
                        y = curr.fetchone()
                        if not y:
                            print("y= %s new_node= %s token= %s" % (y, new_node, token))
                            queue.put((token, new_node, node_name))
            except Exception as e:
                print("add queue error %s" % (e,))
    except Exception as e:
        print("get html error %s" % (e,))
　　
　　def getContent (n,名字,p, top_id):
　　试一试:
　　全局计数器
　　咕咕叫。执行(“选择id从房间名称=% s ',(名字))#先保证名字绝不相同
　　y=curr.fetchone ()
　　打印”存在吗? ?”y“n=? n
　　如果不是y:
　　url=" https://www.zhihu.com/topic/" + str (n) +“/热”
　　html=get_html (url)
　　如果html没有:
　　返回
　　汤=BeautifulSoup (html)
　　title=汤。找到(" div " {“id”:“zh-topic-title”}); (h2)。text
　　pic_path=soup.find (a, {“id”:“zh-avartar-edit-form”}); (img) . get (“src”)
　　描述=soup.find (" div "{“类”:“zm-editable-content”})
　　如果描述不是没有:
　　描述=description.text
　　
　　如果(u“未归类”标题或u”根话题”标题):#允许入库,避免死循环
　　描述=没有
　　
　　tag_path=download_img (pic_path top_id)
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null
　　null如何用python爬取知乎话题吗?