使用python怎么提取html文本 - 行业资讯 - 肥雀云

　　介绍

这期内容当中小编将会给大家带来有关使用python怎么提取html文本,文章内容丰富且以专业的角度为大家分析和叙述,阅读完这篇文章希望大家可以有所收获。

# coding: utf-8

from time import time

import warc
from bs4 import BeautifulSoup
from selectolax.parser import HTMLParser


def get_text_bs(html):
    """Extract the body text of *html* using BeautifulSoup with the lxml parser.

    Returns None when the parsed document has no body; otherwise returns the
    body text with <script> and <style> elements removed, using a newline as
    the separator between text fragments.
    """
    tree = BeautifulSoup(html, 'lxml')

    body = tree.body
    if body is None:
        return None

    # Drop non-content elements so their source does not end up in the text.
    for tag in body.select('script'):
        tag.decompose()
    for tag in body.select('style'):
        tag.decompose()

    text = body.get_text(separator='\n')
    return text


def get_text_selectolax(html):
    """Extract the body text of *html* using selectolax's HTMLParser.

    Returns None when the parsed document has no body; otherwise returns the
    body text with <script> and <style> elements removed, using a newline as
    the separator between text fragments.
    """
    tree = HTMLParser(html)

    if tree.body is None:
        return None

    # Drop non-content elements so their source does not end up in the text.
    for tag in tree.css('script'):
        tag.decompose()
    for tag in tree.css('style'):
        tag.decompose()

    text = tree.body.text(separator='\n')
    return text


def read_doc(record, parser=get_text_selectolax):
    """Return ``(url, text)`` for one WARC record.

    *text* is the result of calling *parser* on the record's HTML, or None
    when the record has no URL or the HTML part is empty after stripping.
    """
    url = record.url
    text = None

    if url:
        payload = record.payload.read()
        # The payload is split once on the first blank line (CRLF CRLF);
        # the part before it is discarded and the rest is treated as HTML.
        header, html = payload.split(b'\r\n\r\n', maxsplit=1)
        html = html.strip()

        if len(html) > 0:
            text = parser(html)

    return url, text


def process_warc(file_name, parser, limit=10000):
    """Parse records from the WARC file *file_name* with *parser* and print timing.

    Iteration stops once the record index exceeds *limit*. Records without a
    URL or without extracted text are skipped and not counted.
    """
    warc_file = warc.open(file_name, 'rb')
    t0 = time()
    n_documents = 0
    for i, record in enumerate(warc_file):
        url, doc = read_doc(record, parser)

        if not doc or not url:
            continue

        n_documents += 1

        if i > limit:
            break

    warc_file.close()
    print('Parser: %s' % parser.__name__)
    print('Parsing took %s seconds and produced %s documents\n' % (time() - t0, n_documents))


# Example session from the article (IPython):
#
# NOTE(review): the source text gives two different end timestamps for the
# archive name (wget URL vs. file_name); the wget URL's value is used for
# both here -- confirm against the actual Common Crawl file name.
#
# >>> !wget https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2018-05/segments/1516084886237.6/warc/CC-MAIN-20180116070444-20180116090444-00000.warc.gz
# >>> file_name = "CC-MAIN-20180116070444-20180116090444-00000.warc.gz"
# >>> process_warc(file_name, get_text_selectolax, 10000)
# Parser: get_text_selectolax
# Parsing took 16.170367002487183 seconds and produced 3317 documents
# >>> process_warc(file_name, get_text_bs, 10000)
# Parser: get_text_bs
# Parsing took 432.6902508735657 seconds and produced 3283 documents

显然,这并不是一种严谨的基准测试方法,但它足以说明问题:selectolax有时比lxml快近30倍。
selectolax最适合将HTML剥离为纯文本。假设我有10000多个HTML片段,需要将它们作为纯文本索引到Elasticsearch中。(Elasticsearch有一个html_strip文本过滤器,但在此场景下它并不是我想要或需要使用的过滤器。)事实证明,以这种规模将HTML剥离为纯文本,实际上效率非常低。那么,最有效的方法是什么?

<li>

PyQuery

from pyquery import PyQuery as pq

text = pq(html).text()

<li>

selectolax

from selectolax.parser import HTMLParser

text = HTMLParser(html).text()

<li>

正则表达式

import re

regex = re.compile(r'<.*?>')