python爬虫案例之如何获取招聘要求 - 行业资讯 - 肥雀云

　　介绍

小编给大家分享一下python爬虫案例之如何获取招聘要求,相信大部分人都还不怎么了解,因此分享这篇文章给大家参考一下,希望大家阅读完这篇文章后大有收获,下面让我们一起去了解一下吧!

根据pid拼接网址 => 得到detail_url,使用requests.get发起请求;为防止爬虫挂掉,一旦发现爬取的detail重复,就重新启动爬虫

根据detail_url获取网页html信息 => requests -> html,使用BeautifulSoup

if html.status_code != 200: 　　print('status_code is {}'.format(html.status_code))

从soup中获取特定元素内容 => 岗位信息

将结果保存到MongoDB中

#,@author:, limingxuan 　　#,@contect: limx2011@hotmail.com 　　#,@blog: https://www.jianshu.com/p/a5907362ba72 　　#,@time: 2018-07-21 　　, 　　import 请求　　得到bs4 import BeautifulSoup 　　import 时间　　得到pymongo import MongoClient 　　, 　　headers =, {,,, 　　,,,& # 39;接受# 39;:,“application/json, text/javascript, */*;, q=0.01“, 　　,,,& # 39;accept-encoding& # 39;:,“gzip,,, br"缩小, 　　,,,& # 39;接收语言,:,“,应用zh型;q=0.9, en; q=0.8“, 　　,,,& # 39;内容类型# 39;:,“应用程序/x-www-form-urlencoded; charset=UTF-8", 　　,,,& # 39;饼干# 39;:,“JSESSIONID=啊?估计=1530137184;,sid=sem_pz_bdpc_dasou_title;, __g=sem_pz_bdpc_dasou_title;, __l 3=r=https % % 2 f % 2 fwww.zhipin.com % 2 fgongsi % 2 f5189f3fadb73e42f1hn40t8 ~ .html& l=% 2 fwww.zhipin.com % 2 fgongsir % 2 f5189f3fadb73e42f1hn40t8 ~ . html % 3 fka % 3 dcompany-jobs& g=% 2 fwww.zhipin.com % 2 f % 3 fsid % 3 dsem_pz_bdpc_dasou_title;, Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1531150234, 1531231870, 1531573701, 1531741316,, lastCity=101010100;, toUrl 3=https % % 2 f % 2 fwww.zhipin.com % 2 fjob_detail % 2 f % 3 fquery % 3 dpython % 26 scity % 3 d101010100;, Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1531743361;,亲自=26651524.1530136298.1530136298.1530137184.286.2.285.199" 　　,,,& # 39;起源# 39;:,“https://www.zhipin.com" 　　,,,& # 39;推荐人# 39;:,“https://www.zhipin.com/job_detail/?query=python& scity=101010100”; 　　,,,& # 39;用户代理# 39;:,“Mozilla/5.0, (Macintosh;, Intel Mac OS X 10 _13_5), AppleWebKit/537.36, (KHTML, like 壁虎),Chrome/67.0.3396.99 Safari/537.36“ 　　,,,} 　　, 　　时间=conn MongoClient (& # 39; 127.0.0.1 # 39;, 27017) 　　db =conn.zhipin_jobs 　　, 　　def init (): 　　,,,items =, db.Python_jobs.find () .sort (& # 39; pid # 39;) 　　,,,for item 拷贝项目: 　　,,,,,,,if & # 39; detial& # 39;,拷贝item.keys():, #当爬虫挂掉时,跳过已爬取的页　　,,,,,,,,,,,继续　　,,,,,,,detail_url =, & # 39; https://www.zhipin.com/job_detail/{} . 
html # 39; .format(项目[& # 39;pid # 39;]), #单引号和双引号相同,str.format()新格式化方式　　,,,,,,,#第一阶段顺利打印出岗位页面的url 　　,,,,,,,印刷(detail_url) 　　,,,,,,,#返回的html是,Response 类的结果　　,,,,,,,html =, requests.get (detail_url headers =,头) 　　,,,,,,,if html.status_code !=, 200: 　　,,,,,,,,,,,印刷(& # 39;status_code is {} & # 39; .format (html.status_code)) 　　,,,,,,,,,,,休息　　,,,,,,,#返回值汤表示一个文档的全部内容(html.praser是html解析器) 　　,,,,,,,soup =, BeautifulSoup (html.text & # 39; html.parser& # 39;) 　　,,,,,,,job =, soup.select (& # 39; .job-sec 、。text # 39;) 　　,,,,,,,印刷(工作) 　　,,,,,,,# ? ? ? 　　,,,,,,,if len(工作)& lt; 1: 　　,,,,,,,,,,,继续　　,,,,,,,项目[& # 39;细节# 39;],=,工作[0].text.strip(), #职位描述　　,,,,,,,的位置=,soup.select (“.job-sec .job-location .location-address"), 　　,,,,,,,项目[& # 39;位置# 39;],=,位置[0].text.strip(), #工作地点　　,,,,,,,项目[& # 39;updated_at& # 39;],=, time.strftime (“Y % - % - % d % H: % m: % S", time.localtime()), #实时爬取时间　　,,,,,,,#打印(项目[& # 39;细节# 39;]) 　　null 　　null 　　null 　　null 　　null 　　null 　　null 　　null 　　null 　　null 　　null 　　null 　　null 　　null