Selenium 百度资源模拟登陆

涉及工具:selenium、pytesseract、requests、phantomjs
实现:IP代理、验证码、模拟操作、图片识别

安装ocr

可以查看这篇文章《Python OCR安装》

安装phantomjs

可以查看这篇文章《selenium phantomjs安装》

代码

main.py

# -*- coding: utf-8 -*-
# @Time    : 2019/4/22 13:56
# @Author  : Lu Baogui
# @Email   : 15766972573@qq.com
# @File    : main.py
# @Software: PyCharm


import pytesseract
import requests

from PIL import Image
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from fake_useragent import UserAgent
from selenium.webdriver.common.proxy import ProxyType

# Work on a private copy so later per-run tweaks never touch Selenium's
# shared DesiredCapabilities.PHANTOMJS defaults.
dcap = DesiredCapabilities.PHANTOMJS.copy()


def ocr_image(filename):
    """Run Tesseract OCR on an image file and return the recognized text.

    Args:
        filename: Path of the image file to read.

    Returns:
        The string produced by ``pytesseract.image_to_string`` using the
        ``chi_sim`` language data.
    """
    # Use the image as a context manager so the underlying file handle is
    # released as soon as OCR finishes instead of lingering until GC.
    with Image.open(filename) as image:
        return pytesseract.image_to_string(image, lang='chi_sim')


def request_download(IMAGE_URL):
    """Download an image, save it as ``./img.png`` and return its OCR text.

    Args:
        IMAGE_URL: URL of the image to fetch.

    Returns:
        The text recognized by ``ocr_image`` in the downloaded image.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps the caller from hanging forever on a stalled connection.
    response = requests.get(IMAGE_URL, timeout=10)
    # Fail loudly on 4xx/5xx instead of writing an error page to disk and
    # feeding it to the OCR step as if it were an image.
    response.raise_for_status()
    with open('./img.png', 'wb') as f:
        f.write(response.content)
    return ocr_image('./img.png')


def login_baidu():
    """Log in to the Baidu Ziyuan site through a headless PhantomJS browser.

    Opens the login page, fills in the user name and password fields,
    downloads the captcha image, OCRs it via ``request_download`` and submits
    the form. The page source (before and after submitting) and the OCR
    result are printed. Any exception raised along the way is caught and
    printed; the function returns nothing.
    """
    # Stays None when setup fails before the browser is created, so the
    # cleanup in ``finally`` never trips over an unbound name and hides the
    # original error.
    driver = None
    try:
        dcap["phantomjs.page.settings.userAgent"] = UserAgent(path='./ua.json').random
        proxy = webdriver.Proxy()
        proxy.proxy_type = ProxyType.MANUAL
        # NOTE(review): this address looks like a masked placeholder (521 is
        # not a valid IPv4 octet) -- replace it with a working proxy. Only the
        # HTTP proxy is set while the login URL below is https; confirm whether
        # ssl_proxy is needed as well.
        proxy.http_proxy = '11.129.171.521:800'
        # Copy the proxy settings into the capabilities and hand them to the
        # driver; without this the user agent and proxy are never applied.
        proxy.add_to_capabilities(dcap)
        driver = webdriver.PhantomJS(
            executable_path='./phantomjs-2.1.1-linux-x86_64/bin/phantomjs',
            desired_capabilities=dcap,
        )
        driver.get('https://ziyuan.baidu.com/login/index?u=/site/index')
        print('1', driver.page_source)
        driver.find_element_by_id("TANGRAM__PSP_4__userName").send_keys("********")
        driver.find_element_by_id("TANGRAM__PSP_4__password").send_keys("********")
        img_url = driver.find_element_by_id("TANGRAM__PSP_4__verifyCodeImg").get_attribute("src")
        img_ocr = request_download(img_url)
        print('2', img_ocr)
        # OCR output may carry surrounding whitespace or newlines; typing a
        # newline into the field would likely act as Enter and submit the
        # form early, so only the trimmed text is sent.
        driver.find_element_by_id("TANGRAM__PSP_4__verifyCode").send_keys(img_ocr.strip())
        driver.find_element_by_id("TANGRAM__PSP_4__submit").click()
        print('3', driver.page_source)
    except Exception as e:
        print('error', e)
    finally:
        if driver is not None:
            # quit() ends the whole session and the PhantomJS process;
            # close() would only close the current window.
            driver.quit()


# Run the login flow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    login_baidu()

点赞
  1. 百度的反爬机制很灵敏,爬了两次就把我的ip封了。有需要可以查看我的代理ip收集的代码,加入定时更换代理ip的模块。

  2. 代理ip爬虫已经补在Github

发表评论

电子邮件地址不会被公开。 必填项已用*标注

Title - Artist
0:00