2022年 11月 9日

Python访问url的方式,模拟浏览器

需要的包

import requests
from fake_useragent import UserAgent   ###随机获取ua
import urllib3
import random  ##随机
from requests.adapters import HTTPAdapter   ### 重试
  • 1
  • 2
  • 3
  • 4
  • 5
GET请求(requests)
def sendGetRequest(url):
    """Send a plain GET request and return the Response, or None on error.

    :param url: target URL
    :return: requests.Response on success, None if the request raised
    """
    s = requests.Session()
    # Suppress the warning emitted because verify=False disables cert checks.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    # BUG FIX: the original referenced self.headers inside a plain function,
    # which raises NameError; build the headers locally instead.
    headers = {
        'user-agent': UserAgent(verify_ssl=False).random,
    }
    try:
        data = s.get(url, headers=headers, verify=False)
    except Exception as e:
        print(e)
        return None
    return data
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
添加cookie的get的请求
def sendGetByCookie(url, cookie):
    """GET *url* with a random user-agent plus the supplied cookie string.

    :param url: target URL
    :param cookie: value for the Cookie request header
    :return: requests.Response on success, None if the request raised
    """
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    session = requests.Session()
    request_headers = {
        'user-agent': UserAgent(verify_ssl=False).random,
        'Cookie': cookie,
    }
    try:
        return session.get(url, headers=request_headers, verify=False)
    except Exception as err:
        print(err)
        return None
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
urllib3的请求方式
def geturl(url, cookie):
    """Fetch *url* with urllib3's PoolManager and return the decoded body.

    :param url: target URL
    :param cookie: cookie header string, or None for no Cookie header
    :return: response body decoded with the default (utf-8) codec
    """
    http = urllib3.PoolManager()
    requests.packages.urllib3.disable_warnings()
    # BUG FIX: the original used self.headers inside a plain function
    # (NameError) and silently ignored the cookie argument.
    headers = {'user-agent': UserAgent(verify_ssl=False).random}
    if cookie is not None:
        headers['Cookie'] = cookie
    r = http.request('GET',
                     url,
                     headers=headers
                     )
    return r.data.decode()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
参数为json的post请求
def postJson(self, url, cookie, data, proxy):
    """POST *data* as a JSON body and return the response text.

    :param url: target URL
    :param cookie: value for the Cookie request header
    :param data: object serialized to JSON for the request body
    :param proxy: proxies dict passed straight to requests, or None
    :return: response text (utf-8), or None if every retry failed
    """
    headers = {
        # Mobile-app user agent string kept verbatim from the original.
        'user-agent': 'com.ss.android.ugc.aweme/251 (Linux; U; Android 4.4.2; zh_CN; OPPO R11; Build/NMF26X; Cronet/58.0.2991.0)',
        'Cookie':cookie,
        'Content-Type': 'application/json'
    }
    r = requests.session()
    requests.packages.urllib3.disable_warnings()
    try:
        res = r.post(url, json=data, headers=headers, proxies=proxy , verify=False)
    except Exception as e:
        print(url, e)
        # Fall back to the retry helper with fresh proxies.
        res = self.cycle(r, headers,url)
    # BUG FIX: cycle() returns None when all retries fail; the original then
    # crashed with AttributeError on res.status_code.
    if res is None:
        return None
    if not res.status_code == 200:
        print('访问状态码', res.status_code)
    res.encoding = 'utf8'
    return res.text
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
通过代理的get的请求
def sendGetByProxy(url, proxies, cookie, paramDict):
     '''
     GET *url*, optionally through a proxy, with transport-level retries.

     :param url: target URL
     :param proxies: proxies = {'http': 'http://localhost:8888', 'https': 'http://localhost:8888'}
     :param cookie: cookie header string, or None
     :param paramDict: extra header fields merged into the request headers
     :return: requests.Response, the result of cycle() after a proxy failure,
         or None when a direct (no-proxy) request raises
     '''
     s = requests.Session()
     urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
     headers = {
         'user-agent': UserAgent(verify_ssl=False).random,
         'Connection': 'close'     # release keep-alive sockets promptly
     }
     if cookie != None:
         headers['Cookie'] = cookie
     if paramDict:
         headers.update(paramDict)
     # Retry at the HTTP adapter level for both schemes.
     s.mount('http://', HTTPAdapter(max_retries=3))
     s.mount('https://', HTTPAdapter(max_retries=3))

     if proxies:
         try:
             # timeout=10; allow_redirects=False suppresses redirect following
             data = s.get(url, headers=headers, proxies=proxies, verify=False, allow_redirects=False, timeout=10)
             s.keep_alive = False   # close idle connections
         except Exception as e:
             print('访问:',url,e)
             # BUG FIX: the original called self.cycle inside a plain function
             # (NameError); cycle() is the module-level retry helper below.
             data = cycle(s, headers, url)
             return data
     else:
         try:
             data = s.get(url, headers=headers, verify=False)
             requests.adapters.DEFAULT_RETRIES = 5
             s.keep_alive = False
         except Exception as e:
             print(e)
             return None
     print(data)
     return data
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
ip出错自动获取新的ip重新获取
def cycle(s, headers, url, proxy_supplier=None):
       '''
       Retry a GET request up to 5 times, optionally with a fresh proxy
       fetched for each attempt.

       :param s: requests.Session (or any object exposing .get)
       :param headers: header dict for the request
       :param url: target URL
       :param proxy_supplier: optional zero-argument callable returning a
           proxies dict (e.g. lambda: getProxy(pool)); when None the request
           is sent without a proxy. (BUG FIX: the original called
           self.getProxy() inside a plain function — NameError.)
       :return: the last Response when status 200 was reached, else None
       '''
       times = 0
       success = False
       data = None
       while times < 5 and not success:
           # BUG FIX: the counter was never incremented, so a persistently
           # failing URL looped forever.
           times += 1
           try:
               # timeout=10; allow_redirects=False suppresses redirects
               data = s.get(url,
                            headers=headers,
                            proxies=proxy_supplier() if proxy_supplier else None,
                            verify=False,
                            allow_redirects=False,
                            timeout=10)
               s.keep_alive = False  # close idle connections
               success = True
               if not data.status_code == 200:
                   print(url, data.status_code)
                   success = False
           except Exception as e:
               print('访问:', e)
       return data
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
获取代理ip
    def getProxy(proxie):
        """Pick a random ip/port pair from *proxie* and build a proxies dict.

        :param proxie: mapping of ip -> port; keys and values may be bytes
            (e.g. straight from redis) or str
        :return: {'http': 'http://ip:port', 'https': 'https://ip:port'}
        """
        proxies = {}
        # BUG FIX: random.sample() rejects dict views (TypeError on
        # Python 3.11+); pick one key with random.choice on a list instead.
        ip = random.choice(list(proxie.keys()))
        port = proxie[ip]
        # Generalized: decode only when redis handed us bytes; str passes through.
        if isinstance(ip, bytes):
            ip = ip.decode('unicode_escape')
        if isinstance(port, bytes):
            port = port.decode('unicode_escape')
        ip_port = ip + ':' + port
        print('获取到的代理为', ip_port)
        proxies['http'] = 'http://' + ip_port
        proxies['https'] = 'https://' + ip_port
        return proxies
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
模拟谷歌浏览器访问本地文件
def seleniumUtil(chromePath, filePath):
    """
    Open a local file (or URL) in a driven Chrome browser and return its HTML.

    :param chromePath: path to the chromedriver executable
    :param filePath: path of the file, relative to the current working directory
    :return: page source captured after a 5-second render wait
    """
    driver = webdriver.Chrome(chromePath)

    # Build a file:// URL rooted at the current working directory.
    target = "file://" + os.getcwd() + filePath
    driver.get(target)   # a plain http(s) URL works here too
    sleep(5)
    page = driver.page_source
    driver.quit()
    return page
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
无界面版谷歌
def ChromeOptionsUtil(chromePath, filePath):
    """
    Render a local file in headless Chrome and return its HTML.

    :param chromePath: path to the chromedriver executable
        (BUG FIX: docstring previously documented a non-existent
        phantomJs_Path parameter)
    :param filePath: path of the file, relative to the working directory
    :return: page source captured after a 2-second render wait
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # run Chrome without a visible window
    # options.add_argument("--disable-gpu")  # sometimes needed on Windows
    file_Path = "file://" + os.getcwd() + filePath
    # NOTE(review): chrome_options= and executable_path= are deprecated in
    # Selenium 4 (use options= / service=); kept as-is for the Selenium
    # version this snippet targets.
    driver = webdriver.Chrome(chrome_options=options, executable_path=chromePath)
    driver.get(file_Path)
    sleep(2)
    data = driver.page_source
    # quit() ends the whole session; the extra close() call was redundant.
    driver.quit()
    return data
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
获取cookie,并且处理成可用cookie
def test():
    """Log in via Selenium, harvest cookies, store them in redis, and return
    the cookie header string.

    NOTE(review): relies on project-local redis_Util and a redacted URL —
    cannot run standalone.
    """
    driver = webdriver.Chrome('D:/chromedriver')
    driver.get("https://*****")
    sleep(3)
    # Click the first live-card entry to trigger the session cookies.
    driver.find_element(By.CSS_SELECTOR, ' li:nth-child(1) > div > div.live-card-following-info > p.live-card-following-info-user > a').click()
    ### Grab the cookies from the browser session
    cookie = driver.get_cookies()
    sleep(5)
    driver.close()
    driver.quit()
    cookies = []
    ### Flatten the list of cookie dicts into "name=value" pairs
    for i in cookie:
        cookies.append(i["name"] + "=" + i["value"])
    cookiestr = '; '.join(item for item in cookies)
    # Persist the cookie string as a hash field in redis db 1.
    ru = redis_Util()
    res = ru.redis_py('127.0.0.1', '6379', '', 1)
    res.hset('cookies', cookiestr, '')

    return cookiestr
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20

网页乱码问题

#response中添加
res.encoding='gbk'
print(res.text)
  • 1
  • 2
  • 3