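# Residue from the blog page this script was copied from (kept for provenance):
# (365)
# (8)
# (130)
# (155)
# (50)
# (22)
# Category: python/ruby
# 2021-09-16 17:00:53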
import re
import requests
from lxml import etree
class spider(object):
    """Scrape paginated problem-list pages and print the problems marked solved.

    Each page is fetched with ``requests``, parsed with ``lxml``, and every
    table row carrying the "已通过这道题目" (already passed) marker contributes
    its problem name. ``count`` accumulates the number of names printed by
    ``run``.
    """

    def __init__(self):
        # NOTE(review): this template has no scheme or host, so requests will
        # reject the formatted URL as written. The site prefix appears to have
        # been lost when the script was copied -- restore it before running.
        self.base_url = '{}/'
        # NOTE(review): the cookie below embeds a session id. Treat it as a
        # credential and load it from configuration rather than committing it.
        # The empty 'referer' value also looks like a stripped URL -- confirm.
        self.headers = {
            'cookie': 'csrftoken=mixu7wxav35yyycdhqbxciow3z3ms0nh31jbbqh; sessionid='
                      '344bo4nowvp9misa9suynjiwz2i5jcof; file_2922585_readed=""; file_2302034_readed=""',
            'referer': '',
            'user-agent': 'mozilla/5.0 (windows nt 10.0; win64; x64) applewebkit/537.36'
                          ' (khtml, like gecko) chrome/93.0.4577.63 safari/537.36'
        }
        # Number of solved problems printed so far by run().
        self.count = 0

    def get_html(self, url, timeout=10):
        """Fetch *url* with the instance headers and return the response text.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout, requests may block indefinitely on a stalled
                connection.

        Returns:
            The decoded response body as a string.
        """
        response = requests.get(
            url=url,
            headers=self.headers,
            timeout=timeout,
        )
        return response.text

    def xpath_func(self, html):
        """Return the stripped names of all problems marked as passed in *html*.

        Args:
            html: Markup of one problem-list page.

        Returns:
            A list of problem names; empty when *html* is empty or contains
            no matching rows.
        """
        # Empty markup has nothing to parse; skip the parser entirely.
        if not html:
            return []
        name_bds = '//tbody/tr[./td/span[@title="已通过这道题目"]]/td/a/text()'
        base_obj = etree.HTML(html)
        if base_obj is None:
            return []
        return [name.strip() for name in base_obj.xpath(name_bds)]

    def re_func(self, html, re_bds):
        """Return every match of the pattern *re_bds* found in *html*.

        The pattern is compiled with ``re.S`` so that ``.`` also matches
        newlines, letting a pattern span several lines of markup.

        Args:
            html: Text to search.
            re_bds: Regular-expression source string.

        Returns:
            The list produced by ``Pattern.findall``.
        """
        pattern = re.compile(re_bds, re.S)
        return pattern.findall(html)

    def parse_html(self, url):
        """Download *url* and return the passed-problem names found on it."""
        html = self.get_html(url)
        return self.xpath_func(html)

    def run(self):
        """Ask for confirmation, then crawl pages 1-79 and print the results.

        Every problem name found is printed as a table row and counted in
        ``self.count``; a summary line with the total is printed at the end.
        Any answer other than ``'y'`` prints a farewell message and returns.
        """
        warning = input('您马上就要爬取acwing了,看一下你的做题数,您的劳动成果将会在下面展示出来,确定要看吗?(y/n)')
        if warning != 'y':
            print('已经退出,你这个弱者')
            return

        separator = ' --------------------------------- '
        print('爬虫系统已经启动...正在努力抓取,请稍等....')
        print(separator)
        print('| name |')
        print(separator)
        for page in range(1, 80):
            url = self.base_url.format(page)
            for name in self.parse_html(url):
                # Increment (not reset) so the final total covers every row.
                self.count += 1
                print('| ' + name)
                print(separator)
        print('经过您的不懈努力,您一共做了' + str(self.count) + '道题,继续努力!!')
# Script entry point: build the crawler and start the interactive run.
if __name__ == '__main__':
    # Use a distinct name so the `spider` class is not shadowed by its instance.
    crawler = spider()
    crawler.run()
# Previous post:
# Next post: