直接使用 Requests 模块进行 URL 数据拉取

def get_url_html(self, task_url):
    """Download the raw body of ``task_url``, retrying once on failure.

    The response is streamed in chunks so very large bodies can be
    length-checked without buffering twice. A download is accepted only
    when the HTTP status is 200 and, if the server sent a
    ``Content-Length`` header, the number of bytes actually received is
    at least that value.

    :param task_url: URL to fetch (sent with ``self.headers``).
    :return: ``(html, get_status)`` — ``html`` is the body as ``bytes``
        on success (empty string on failure, matching the original
        contract), ``get_status`` is ``True`` only for a complete
        200 response.
    """
    html = ''
    get_status = False
    # At most two attempts total (one retry). A bounded for-loop avoids
    # the original infinite loop that occurred when the server sent no
    # Content-Length header (the counter was never incremented).
    for _attempt in range(2):
        try:
            # Stream the body chunk by chunk and keep the chunks: the
            # original called response.content AFTER iter_content had
            # consumed the stream, which raises RuntimeError in requests
            # and made the success path unreachable.
            with requests.get(url=task_url, headers=self.headers,
                              timeout=300, stream=True) as response:
                chunks = []
                for chunk in response.iter_content(8192):
                    chunks.append(chunk)
            body = b''.join(chunks)
            actual_length = len(body)

            # If the server declared a Content-Length, verify we got the
            # whole body; otherwise accept whatever was streamed.
            complete = True
            expected_length = response.headers.get('Content-Length')
            if expected_length:
                logging.info(expected_length)
                logging.info(actual_length)
                if actual_length < int(expected_length):
                    logging.error(" =================     Content-Length  URL: {} , Error: 提取内容不完整!     ================= ".format(task_url))
                    complete = False

            if response.status_code == 200 and complete:
                html = body
                get_status = True
                break
        except Exception as e:
            logging.error(" =================     Get  URL: {} , Error: {}     ================= ".format(task_url, e))
    return html, get_status