scrapy结合加速乐完成无损爬取

中间件重要知识: 返回None代表继续通过 返回request就重新走调度流程

本文中，需要在 settings 中设置 COOKIES_ENABLED = True

这里超链接一篇文章(不理解COOKIES_ENABLED 的可以看看)—scrapy 中 COOKIES_ENABLED 设置_NealHuiwen的博客-CSDN博客

其次需要开启爬虫中的cookie中间件

utilsMiddlewares.py >>>>>>>

class HandleCookieMiddleware:
    """Downloader middleware that defeats the JSL ("jiasule") anti-bot gate.

    A protected site answers the first hit with HTTP 521 and a JS cookie
    challenge.  This middleware detects the 521, solves the challenge via
    ``get_cookie`` and reschedules the original request carrying the
    decrypted cookies (requires ``COOKIES_ENABLED = True`` in settings).
    """

    def process_request(self, request, spider):
        # Returning None lets the request continue down the middleware chain.
        pass

    def process_response(self, request, response, spider):
        """Intercept 521 challenge pages; otherwise pass the response through.

        Returns
        -------
        A Response (normal pages, or the original 521 page when the
        challenge could not be parsed) or a Request to be rescheduled
        with the solved cookies.
        """
        if response.status != 521:
            # Bug fix: the original fell off the end here and returned None,
            # which violates the downloader-middleware contract.
            return response

        print("jsl>>>>>>", response.url)
        cookies, challenge_response, ok = self.get_cookie(response)
        if not ok:
            # Challenge markup not found — hand the original 521 page on.
            return challenge_response

        request.cookies = cookies
        request.headers['User-Agent'] = (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
        )
        # The URL was already seen once, so the dupefilter would silently
        # drop the retry unless it is marked as unfiltered.
        request.dont_filter = True
        # Returning a Request sends it back through the scheduler.
        return request
def get_cookie(self, response) -> tuple:
    """Solve the two-stage JSL cookie challenge embedded in a 521 page.

    Stage 1: the 521 body contains a JSFuck-obfuscated expression whose
    evaluation yields ``__jsl_clearance...=<value>;...``.  Stage 2: a
    second request with that cookie returns a ``go({...})`` payload that
    ``DecryptJsl`` turns into the final clearance cookie value.

    Parameters
    ----------
    response : the Scrapy 521 response.

    Returns
    -------
    ``(cookies, response, ok)`` — on success ``cookies`` is the dict to
    attach to the retry and ``response`` is None; on failure an empty
    dict, the untouched original response, and ``ok=False``.
    """
    found = re.findall(r'cookie=(.*?);location', response.text)
    if not found:
        # No challenge expression — not a page we can solve.
        # (Fixed: previously returned '' where a dict was promised.)
        return {}, response, False
    encrypt_jsfuck = found[0]

    # Stage 1: evaluate the obfuscated JS to get "key=value;...".
    decrypt_jsfuck = str(execjs.eval(encrypt_jsfuck))
    jsl_cookie_key, _, rest = decrypt_jsfuck.partition('=')
    first_decrypt_cookie = rest.split(';')[0]

    # Collect cookies from the 521 response's Set-Cookie header.
    # split('=', 1) keeps values that themselves contain '='
    # (the original split dropped everything after a second '=').
    first_set_cookie = {}
    set_cookie_headers = response.headers.getlist('Set-Cookie')
    if set_cookie_headers:
        for part in set_cookie_headers[0].decode("utf-8").split(";"):
            if "=" not in part:
                continue
            key, value = part.split("=", 1)
            first_set_cookie[key] = value

    first_cookies = {
        **first_set_cookie,
        jsl_cookie_key: first_decrypt_cookie}

    # Stage 2: replay the URL with the stage-1 cookie to obtain the
    # go({...}) payload.  timeout added so a stalled site can't hang
    # the crawl indefinitely.
    res1 = requests.get(response.url, cookies=first_cookies, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
    }, timeout=30)
    data = json.loads(re.findall(r';go\((.*?)\)', res1.text)[0])
    second_decrypt_cookie = DecryptJsl().get_parameter(data)

    cookies = {
        **first_set_cookie,
        jsl_cookie_key: second_decrypt_cookie}
    return cookies, None, True

2024-4-24

上面方法是我自实现的方法。可能很多时候不适用。

这几天入职新公司 遇到个新方法 非常好用

直接看源码。好用成DOG啊。看到这里的同学 你遇到宝了。

def process_response(self, request, response, spider):
    """Handle JSL 521 challenges by refetching the page out-of-band.

    ``DecryptJsl.get_cookies`` solves the whole challenge from the URL
    alone; the page is then re-downloaded with ``requests`` and wrapped
    back into a Scrapy ``TextResponse`` so the spider callback sees a
    normal 200 page.

    Returns the original response for non-521 statuses, otherwise the
    rebuilt TextResponse.
    """
    if response.status != 521:
        return response

    print("jsl>>>>>>", response.url)
    cookies = DecryptJsl().get_cookies(response.url)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
    }
    # timeout added: without it a stalled server hangs the crawl.
    req_res = requests.get(response.url, headers=headers, cookies=cookies,
                           timeout=30)

    # Bug fix: pass request= so response.request / response.meta keep
    # working in the spider callback (the original TextResponse was
    # detached from its request).
    return TextResponse(url=req_res.url,
                        body=req_res.text,
                        encoding='utf-8',
                        request=request)