怎么在Python中使用BeautifulSoup4爬取数据 - 行业资讯 - 肥雀云

　　介绍

怎么在Python中使用BeautifulSoup4爬取数据？很多新手对此不是很清楚。为了帮助大家解决这个难题，下面小编将为大家详细讲解，有这方面需求的人可以来学习一下，希望你能有所收获。

具体如下:

#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Scrape quotes.toscrape.com and store the quotes in MySQL.

The script reads the tag links listed on the home page, walks every
result page of each tag, and inserts one row per quote (content, author,
tags) into the ``quotes`` table.
"""
from urllib.request import urlopen
import re

from bs4 import BeautifulSoup
import pymysql

# Site root without a trailing slash; tag hrefs such as "/tag/love/" are
# appended to it directly.
BASE_URL = "http://quotes.toscrape.com"

INSERT_SQL = "insert into quotes(content,author,tags) values(%s,%s,%s)"


def _fetch_soup(url):
    """Download *url* and return it parsed with the built-in HTML parser."""
    # The context manager closes the HTTP response once parsing is done.
    with urlopen(url) as response:
        return BeautifulSoup(response, "html.parser")


def find_top_ten(url):
    """Return the href of every tag link found on the page at *url*.

    The links are the ``<a>`` elements nested inside ``span.tag-item``.
    """
    soup = _fetch_soup(url)
    return [tag.get("href") for tag in soup.select("span.tag-item a")]


def insert_into_mysql(records):
    """Insert *records* into the ``quotes`` table and commit.

    Each record is a ``(content, author, tags)`` sequence. Nothing is done
    (and no connection is opened) when *records* is empty.
    """
    if not records:
        return
    con = pymysql.connect(
        host="localhost",
        user="root",
        password="root",
        database="quotes",
        charset="utf8",
        port=3306,
    )
    try:
        with con.cursor() as cursor:
            # Values are passed as parameters, never formatted into the SQL.
            cursor.executemany(INSERT_SQL, records)
        con.commit()
    finally:
        # Release the connection even if the insert or commit fails.
        con.close()


# Example tag page: http://quotes.toscrape.com/tag/love/
# All quotes of a tag are wanted, so pagination has to be followed.
# Browsing the site shows that the paginated URL looks like
#     http://quotes.toscrape.com/tag/love/page/1/
# A page past the end contains no quote elements, which ends the loop.
def find_link_content(link):
    """Store every quote reachable from the tag path *link*.

    *link* is a site-relative tag path such as ``/tag/love/``. Pages are
    requested starting at 1 until one of them contains no quotes; each
    page's quotes are inserted through ``insert_into_mysql``.
    """
    page = 1
    while True:
        new_link = BASE_URL + link + "page/" + str(page) + "/"
        print(new_link)
        soup = _fetch_soup(new_link)

        quote_nodes = soup.select("div.row div.col-md-8 span.text")
        # No quotes on this page: we are past the last page.
        if not quote_nodes:
            break

        # NOTE(review): the surrounding quotation marks stripped here are
        # assumed to be the curly quotes used by the site - confirm.
        quotes = [node.text.strip("\u201c\u201d") for node in quote_nodes]
        authors = [node.text for node in soup.select("small.author")]
        # A missing "content" attribute is treated as an empty tag string.
        tags_list = [
            node.get("content") or "" for node in soup.select("meta.keywords")
        ]

        record_list = []
        for quote, author, tags in zip(quotes, authors, tags_list):
            # NOTE(review): the original separator replacement was garbled;
            # assumed to turn ASCII commas into full-width commas - confirm.
            tags = tags.replace(",", "\uff0c")
            print(tags)
            record_list.append((quote, author, tags))

        insert_into_mysql(record_list)
        page += 1


def main():
    """Scrape every tag listed on the home page."""
    for link in find_top_ten(BASE_URL + "/"):
        print(link)
        find_link_content(link)


if __name__ == "__main__":
    main()