Crawler Proxies and Taobao Anti-Crawling Basics (2)

1. Handling anti-crawling mechanisms
(1) Using proxies
Applicable scenario: most sites limit the number of requests allowed per IP.
For the "too many requests" case, we can also throttle how often the crawler hits the site so it does not get banned; a small throttling sketch follows, and after it the proxy example.
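A minimal throttling sketch. The delay bounds and the helper name are illustrative assumptions, not values from the original:
import time
import random
import requests

def fetch_with_delay(urls, min_delay=1.0, max_delay=3.0):
    """Fetch each URL with a random pause in between, keeping the request rate low."""
    responses = []
    for url in urls:
        responses.append(requests.get(url, timeout=10))
        # sleep a random interval so requests do not arrive at a fixed rhythm
        time.sleep(random.uniform(min_delay, max_delay))
    return responses
The proxy example itself: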
#! -*- coding: utf-8 -*-
import requests
import random

# Target page to request
targetUrl = "http://httpbin.org/ip"

# Target HTTPS page
# targetUrl = "https://httpbin.org/ip"

# Proxy server (product site: www.16yun.cn)
proxyHost = "t.16yun.cn"
proxyPort = "31111"

# Proxy tunnel credentials
proxyUser = "username"
proxyPass = "password"

proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
    "host": proxyHost,
    "port": proxyPort,
    "user": proxyUser,
    "pass": proxyPass,
}

# Route both http and https traffic through the HTTP proxy
proxies = {
    "http": proxyMeta,
    "https": proxyMeta,
}

# Header used to switch the exit IP
tunnel = random.randint(1, 10000)
headers = {"Proxy-Tunnel": str(tunnel)}

resp = requests.get(targetUrl, proxies=proxies, headers=headers)
print(resp.status_code)
print(resp.text)
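A design note, stated as an assumption about this kind of tunnel proxy: requests that carry the same Proxy-Tunnel value are generally routed through the same exit IP, so the usual pattern is to keep one value per logical session and draw a fresh random value when you want the IP to rotate. A minimal sketch reusing the names defined above:
# Assumption: requests sharing a Proxy-Tunnel value keep the same exit IP
def new_session_headers():
    """Return headers for a new logical session with its own tunnel (and exit IP)."""
    return {"Proxy-Tunnel": str(random.randint(1, 10000))}

session_headers = new_session_headers()
for _ in range(3):
    # these three requests should go out through one exit IP
    print(requests.get(targetUrl, proxies=proxies, headers=session_headers).text)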
(2) Filtering with mitmproxy
Next, to deal with navigator.webdriver detection, we run mitmproxy as a man-in-the-middle proxy and inject masking code into the site's own scripts, which lets us evade the automation check.
First, configure mitmproxy.
The injection script (httpProxy.py) that adds the masking code:
TARGET_URL = 'https://g.alicdn.com/secdev/sufei_data/3.6.8/index.js'
INJECT_TEXT = 'Object.defineProperties(navigator,{webdriver:{get:() => false}});'

def response(flow):
    if flow.request.url.startswith(TARGET_URL):
        flow.response.text = INJECT_TEXT + flow.response.text
        print('Injection succeeded')
    if 'um.js' in flow.request.url or '115.js' in flow.request.url:
        # mask the selenium detection
        flow.response.text = flow.response.text + INJECT_TEXT
Start the proxy before running the crawler:
mitmdump -s httpProxy.py -p 9000
Example code:

# -*- coding: UTF-8 -*-

import time
import re
from datetime import date, timedelta
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options

TB_LOGIN_URL = 'https://login.taobao.com/member/login.jhtml'
CHROME_DRIVER = '/usr/local/bin/chromedriver'  # the path differs between Windows and Mac


class SessionException(Exception):
    """
    Session exception
    """

    def __init__(self, message):
        super().__init__(message)
        self.message = message

    def __str__(self):
        return self.message


class Crawler:

    def __init__(self):
        self.browser = None

    def start(self, username, password):
        print("Initializing the browser")
        self.__init_browser()
        print("Switching to the password login form")
        self.__switch_to_password_mode()
        time.sleep(0.5)
        print("Entering the username")
        self.__write_username(username)
        time.sleep(2.5)
        print("Entering the password")
        self.__write_password(password)
        time.sleep(3.5)
        print("Simulating the slider unlock")
        if self.__lock_exist():
            self.__unlock()
        print("Submitting the login request")
        self.__submit()
        time.sleep(4.5)
        # Login succeeded, request the target page directly
        print("Login succeeded, navigating to the target page")
        self.__navigate_to_target_page()
        time.sleep(6.5)
        print("Parsing the page content")
        crawler_list = self.__parse_page_content()
        # Connect to the database and save the data
        print("Saving the data to the MySQL database")
        self.__save_list_to_db(crawler_list)

    def __switch_to_password_mode(self):
        """
        Switch to the password login form
        :return:
        """
        if self.browser.find_element_by_id('J_QRCodeLogin').is_displayed():
            self.browser.find_element_by_id('J_Quick2Static').click()

    def __write_username(self, username):
        """
        Enter the username
        :param username:
        :return:
        """
        username_input_element = self.browser.find_element_by_id('TPL_username_1')
        username_input_element.clear()
        username_input_element.send_keys(username)

    def __write_password(self, password):
        """
        Enter the password
        :param password:
        :return:
        """
        password_input_element = self.browser.find_element_by_id("TPL_password_1")
        password_input_element.clear()
        password_input_element.send_keys(password)

    def __lock_exist(self):
        """
        Check whether the slider captcha is present
        :return:
        """
        return self.__is_element_exist('#nc_1_wrapper') and self.browser.find_element_by_id(
            'nc_1_wrapper').is_displayed()

    def __unlock(self):
        """
        Perform the slider unlock
        :return:
        """
        bar_element = self.browser.find_element_by_id('nc_1_n1z')
        ActionChains(self.browser).drag_and_drop_by_offset(bar_element, 800, 0).perform()
        time.sleep(1.5)
        self.browser.get_screenshot_as_file('error.png')
        if self.__is_element_exist('.errloading > span'):
            error_message_element = self.browser.find_element_by_css_selector('.errloading > span')
            error_message = error_message_element.text
            self.browser.execute_script('noCaptcha.reset(1)')
            raise SessionException('Slider verification failed, message = ' + error_message)

    def __submit(self):
        """
        Submit the login form
        :return:
        """
        self.browser.find_element_by_id('J_SubmitStatic').click()
        time.sleep(0.5)
        if self.__is_element_exist("#J_Message"):
            error_message_element = self.browser.find_element_by_css_selector('#J_Message > p')
            error_message = error_message_element.text
            raise SessionException('Login error, message = ' + error_message)

    # Navigate to the target page
    def __navigate_to_target_page(self):
        pass

    # Parse the page data
    def __parse_page_content(self):
        pass

    # Save the data
    def __save_list_to_db(self, crawler_list):
        pass

    def __is_element_exist(self, css_selector):
        # Helper referenced above but missing from the original snippet; this implementation is an assumption
        try:
            self.browser.find_element_by_css_selector(css_selector)
            return True
        except NoSuchElementException:
            return False

    def __init_browser(self):
        """
        Initialize the selenium browser
        :return:
        """
        options = Options()
        # options.add_argument("--headless")
        prefs = {"profile.managed_default_content_settings.images": 1}
        options.add_experimental_option("prefs", prefs)
        options.add_argument('--proxy-server=http://127.0.0.1:9000')
        options.add_argument('disable-infobars')
        options.add_argument('--no-sandbox')
        self.browser = webdriver.Chrome(executable_path=CHROME_DRIVER, options=options)
        self.browser.implicitly_wait(3)
        self.browser.maximize_window()
        self.browser.get(TB_LOGIN_URL)


# Run it from the command line
Crawler().start('username', 'password')
Here is a simpler login approach: logging in through Weibo sidesteps the slider captcha that may otherwise appear (the Weibo account must already be bound to the Taobao account).
import time
import requests
from selenium import webdriver

try:
    chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument('--headless')
    # The next line opens Chrome in developer mode (hides the automation switch)
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    browser = webdriver.Chrome(options=chrome_options)
    browser.get("https://s.taobao.com/search?q=iPad")
    button = browser.find_element_by_class_name('login-switch')
    button.click()
    button = browser.find_element_by_class_name('weibo-login')
    button.click()
    user_name = browser.find_element_by_name('username')
    user_name.clear()
    user_name.send_keys('')  # enter the Weibo account name; it must already be bound to Taobao
    time.sleep(1)
    user_keys = browser.find_element_by_name('password')
    user_keys.clear()
    user_keys.send_keys('')  # enter the Weibo password
    time.sleep(1)
    button = browser.find_element_by_class_name('W_btn_g')
    button.click()
    time.sleep(1)
    cookies = browser.get_cookies()
    ses = requests.Session()  # keep the logged-in state
    c = requests.cookies.RequestsCookieJar()
    for item in cookies:
        c.set(item["name"], item["value"])
    ses.cookies.update(c)
    time.sleep(1)
    print('Login succeeded')
except Exception:
    print("Login failed")
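Once the browser cookies have been copied into the requests session, later pages can be fetched without Selenium. A minimal sketch reusing ses from above; the search URL and header value are illustrative assumptions:
# Assumption: the copied cookies are enough for plain requests to the search page
search_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/70.0.1276.73 Safari/537.36'
}
resp = ses.get('https://s.taobao.com/search?q=iPad', headers=search_headers)
print(resp.status_code)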
2. Masquerading as a browser, and defeating anti-hotlinking
Some sites check whether a request comes from a real browser or from an automated client. In that case, add a User-Agent header to declare that you are a browser. Some sites also check whether a Referer header is present and whether it looks legitimate, so it is usually worth adding a Referer as well.
The User-Agent can be drawn from the library of real browser strings provided by 亿牛云, and the Referer can be disguised as a Baidu search result.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.1276.73 Safari/537.36',
    'Referer': 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=nike'
}
response = requests.get(url=url, headers=headers)
Crawling is a long-running task, so it needs to be paired with proxies to collect data reliably; the 亿牛云 enhanced crawler proxy lets a crawler keep collecting data stably over long periods.
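To avoid presenting the same fingerprint on every request, the User-Agent can also be rotated per request. A minimal sketch assuming a small hand-maintained pool of real browser strings (the pool entries and helper name are illustrative):
import random
import requests

# Illustrative UA pool; in practice it would come from a maintained library of real browser strings
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.1276.73 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
]

def get_with_random_headers(url):
    """Send a GET request with a randomly chosen User-Agent and a Baidu-search Referer."""
    headers = {
        'User-Agent': random.choice(USER_AGENTS),
        'Referer': 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=nike',
    }
    return requests.get(url, headers=headers)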


Copyright notice: this article is an original work by Laicaling, released under the CC 4.0 BY-SA license. Please include a link to the original source and this notice when republishing.