使用python怎么将爬取的网页内容转换为PDF文件 - 行业资讯

　　介绍

本文向大家介绍如何使用 Python 将爬取的网页内容转换为 PDF 文件,包括基本知识点总结和需要注意的事项,具有一定的参考价值,需要的朋友可以参考一下。

python主要用来做什么

Python 主要应用于:1、网络开发;2、数据科学研究;3、网络爬虫;4、嵌入式应用开发;5、游戏开发;6、桌面应用开发。

# coding=utf-8
"""Crawl an online tutorial and convert it into a single PDF file.

Every chapter page listed in the tutorial's side menu is downloaded, its main
content is saved as a standalone HTML file, each HTML file is rendered to PDF
with pdfkit (wkhtmltopdf), and the per-chapter PDFs are merged into one file.
"""
import os
import re
import sys
import time

import pdfkit
import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfFileMerger

if sys.version_info[0] == 2:
    # Python 2 only: make implicit str/unicode conversions use UTF-8.
    # Python 3 has neither reload() as a builtin nor setdefaultencoding().
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf8')

# Site root, used to build chapter URLs and to absolutize image paths.
BASE_URL = "http://www.liaoxuefeng.com"
# Index page whose side menu lists every chapter of the tutorial.
INDEX_URL = BASE_URL + "/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000"

# Groups: (1) everything up to and including src=", (2) the src value,
# (3) the closing quote.
IMG_SRC_PATTERN = re.compile(r'(<img .*?src=")(.*?)(")')

html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
</head>
<body>
{content}
</body>
</html>

"""


# ----------------------------------------------------------------------
def _absolutize_img_src(match):
    """Return the matched <img ... src="..."> text with an absolute src."""
    prefix, src, closing_quote = match.group(1), match.group(2), match.group(3)
    # Test the src value itself (group 2). Testing the closing quote
    # (group 3) would prefix every URL, including absolute ones.
    if not src.startswith("http"):
        src = BASE_URL + src
    return prefix + src + closing_quote


def parse_url_to_html(url, name):
    """Download one chapter page and save its main content as an HTML file.

    :param url: URL of the page to parse
    :param name: file name the extracted HTML is written to
    :return: ``name`` on success, ``None`` if the page could not be processed
    """
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Main content of the page.
        body = soup.find_all(class_="x-wiki-content")[0]
        # Chapter title.
        title = soup.find('h5').get_text()

        # Put the title, centered, at the top of the content.
        center_tag = soup.new_tag("center")
        title_tag = soup.new_tag('h2')
        title_tag.string = title
        center_tag.insert(1, title_tag)
        body.insert(1, center_tag)
        html = str(body)

        # Rewrite relative <img> src paths to absolute URLs so that the
        # images still resolve when the HTML is rendered from a local file.
        html = IMG_SRC_PATTERN.sub(_absolutize_img_src, html)
        html = html_template.format(content=html)
        html = html.encode("utf-8")
        with open(name, 'wb') as f:
            f.write(html)
        return name
    except Exception as e:
        # Best effort: one broken page must not abort the whole crawl,
        # but report which page failed and why.
        print(u"解析错误! %s: %s" % (url, e))
        return None


# ----------------------------------------------------------------------
def get_url_list():
    """Fetch the tutorial index page and return the URLs of all chapters.

    :return: list of absolute chapter URLs, in menu order
    """
    response = requests.get(INDEX_URL)
    soup = BeautifulSoup(response.content, "html.parser")
    menu_tag = soup.find_all(class_="uk-nav uk-nav-side")[1]
    urls = []
    for li in menu_tag.find_all("li"):
        url = BASE_URL + li.a.get('href')
        urls.append(url)
    return urls


# ----------------------------------------------------------------------
def save_pdf(htmls, file_name):
    """Render HTML file(s) to a PDF file with pdfkit.

    :param htmls: an HTML file name, or a list of HTML file names
    :param file_name: name of the PDF file to write
    :return: None
    """
    options = {
        'page-size': 'Letter',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip'),
        ],
        'cookie': [
            ('cookie-name1', 'cookie-value1'),
            ('cookie-name2', 'cookie-value2'),
        ],
        'outline-depth': 10,
    }
    pdfkit.from_file(htmls, file_name, options=options)


# ----------------------------------------------------------------------
def main():
    """Download every chapter, convert each to PDF, merge, then clean up."""
    start = time.time()
    file_name = u"liaoxuefeng_Python3_tutorial"
    urls = get_url_list()

    # Keep only the pages that were actually saved, together with their
    # index, instead of assuming a fixed number of chapters.
    pages = []
    for index, url in enumerate(urls):
        saved = parse_url_to_html(url, str(index) + ".html")
        if saved is not None:
            pages.append((index, saved))

    htmls = []
    pdfs = []
    for index, html in pages:
        pdf = file_name + str(index) + '.pdf'
        save_pdf(html, pdf)
        htmls.append(html)
        pdfs.append(pdf)
        print(u"转换完成第" + str(index) + u"个html")

    merger = PdfFileMerger()
    handles = []
    try:
        for (index, _), pdf in zip(pages, pdfs):
            handle = open(pdf, 'rb')
            handles.append(handle)
            merger.append(handle)
            print(u"合并完成第" + str(index) + u"个pdf" + pdf)

        with open(u"廖雪峰Python_all.pdf", "wb") as output:
            merger.write(output)
    finally:
        # The merger reads its inputs lazily, so they can only be closed
        # after write(); close them before the temporary files are removed.
        merger.close()
        for handle in handles:
            handle.close()

    print(u"输出PDF成功!")

    for html in htmls:
        os.remove(html)
        print(u"删除临时文件" + html)

    for pdf in pdfs:
        os.remove(pdf)
        print(u"删除临时文件" + pdf)

    total_time = time.time() - start
    print(u"总共耗时:%f 秒" % total_time)


# ----------------------------------------------------------------------
def changeDir(dir_name):
    """Switch the working directory to ``dir_name``, creating it if needed."""
    if not os.path.exists(dir_name):
        # makedirs also creates missing parent directories.
        os.makedirs(dir_name)

    os.chdir(dir_name)


# ----------------------------------------------------------------------
if __name__ == "__main__":
    # Directory where the intermediate and output files are stored.
    dir_name = '/home/Python/Html'
    changeDir(dir_name)
    main()