1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
import os
import re
from hashlib import md5

import requests
from lxml import etree
def get_page(page):
    """Fetch one page of the post listing and return the post links.

    Sends the "ajax_load_posts" form to the site's admin-ajax endpoint with
    ``paged`` set to *page*, parses the returned HTML fragment and collects
    the ``href`` of every ``<a>`` that is a direct child of a
    ``<div class="list-body">``.

    Args:
        page: Page number placed in the ``paged`` form field.

    Returns:
        A list of href strings; empty when the response has no usable markup.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    data = {
        "append": "list-archive",
        "paged": page,
        "action": "ajax_load_posts",
        "query": "25",
        "page": "cat",
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36",
        "cookie": "_ga=GA1.2.879115049.1597016349; _gid=GA1.2.29282622.1597016349; Hm_lvt_a5eba7a40c339f057e1c5b5ac4ab4cc9=1597016349; _GPSLSC=; verynginx_sign_cookie=39154348106419081c88a4fccdff4f0a; verynginx_sign_javascript=ebf13703c41835d486309e30ef152c1e; Hm_lpvt_a5eba7a40c339f057e1c5b5ac4ab4cc9=1597045709; _gat_gtag_UA_127463675_2=1",
    }
    # The response is deliberately NOT named `re`: that would shadow the
    # regex module imported at file level.
    # (connect, read) timeout mirrors download() so a stalled server cannot
    # hang the crawl forever.
    response = requests.post(
        "https://www.vmgirls.com/wp-admin/admin-ajax.php",
        data=data,
        headers=headers,
        timeout=(3, 7),
    )
    if not response.text.strip():
        # Nothing to parse (e.g. past the last page).
        return []
    html = etree.HTML(response.text)
    if html is None:
        # NOTE(review): lxml may hand back None when the input holds no
        # element content; guard so .xpath is never called on None.
        return []
    return html.xpath("//div[@class='list-body']/a/@href")
def get_detials(res):
    """Collect absolute image URLs from every post page in *res*.

    Each URL in *res* is fetched with GET; the page text is scanned for
    ``<a href="..." alt="..." title="...">`` anchors and every captured
    href is prefixed with ``https://www.vmgirls.com/``.

    Args:
        res: Iterable of post-page URLs (as returned by ``get_page``).

    Returns:
        A flat list of prefixed URLs, in page order then match order.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36",
        "cookie": "_ga=GA1.2.879115049.1597016349; _gid=GA1.2.29282622.1597016349; Hm_lvt_a5eba7a40c339f057e1c5b5ac4ab4cc9=1597016349; _GPSLSC=; verynginx_sign_cookie=39154348106419081c88a4fccdff4f0a; verynginx_sign_javascript=ebf13703c41835d486309e30ef152c1e; Hm_lpvt_a5eba7a40c339f057e1c5b5ac4ab4cc9=1597046540; _gat_gtag_UA_127463675_2=1",
    }
    # Compiled once: the same pattern is applied to every post page.
    link_pattern = re.compile('<a href="(.*?)" alt=".*?" title=".*?">')
    urls = []
    for post_url in res:
        # Same (connect, read) timeout as download(); without it a single
        # unresponsive page would block the whole run.
        result = requests.get(post_url, headers=headers, timeout=(3, 7))
        urls.extend(
            "https://www.vmgirls.com/" + href
            for href in link_pattern.findall(result.text)
        )
    return urls
def download(url):
    """Download every image in *url* into ``./imgs``.

    Each file is named after the MD5 hex digest of its bytes with a
    ``.jpeg`` suffix, so identical content overwrites the same file.

    Args:
        url: Iterable of image URLs.

    Raises:
        requests.RequestException: on connection failure or timeout.
        OSError: if the output directory or file cannot be written.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36",
        "cookie": "__ga=GA1.2.879115049.1597016349; _gid=GA1.2.29282622.1597016349; Hm_lvt_a5eba7a40c339f057e1c5b5ac4ab4cc9=1597016349; _GPSLSC=; verynginx_sign_cookie=39154348106419081c88a4fccdff4f0a; verynginx_sign_javascript=ebf13703c41835d486309e30ef152c1e; Hm_lpvt_a5eba7a40c339f057e1c5b5ac4ab4cc9=1597045839",
    }
    # open() below does not create missing directories; make sure the
    # target exists so the first write cannot fail with FileNotFoundError.
    os.makedirs("./imgs", exist_ok=True)
    for i in url:
        img = requests.get(i, headers=headers, timeout=(3, 7))
        if not img.ok:
            # An error page is not an image; do not store it as .jpeg.
            print(f"跳过 {i}: HTTP {img.status_code}")
            continue
        # Hash once and reuse for both the file name and the log line.
        file_name = f"{md5(img.content).hexdigest()}.jpeg"
        with open(f"./imgs/{file_name}", "wb") as f:
            f.write(img.content)
        print(f"下载图片:{file_name}")
def main(first_page=1, last_page=44):
    """Crawl listing pages *first_page*..*last_page* (inclusive).

    For each page: print a progress line, fetch the post links, resolve
    them to image URLs and download the images. The defaults reproduce
    the original fixed range of pages 1 through 44.

    Args:
        first_page: First listing page to crawl.
        last_page: Last listing page to crawl (inclusive).
    """
    for page_no in range(first_page, last_page + 1):
        print(f"正在爬取第{page_no}页~~~")
        post_links = get_page(page_no)
        image_urls = get_detials(post_links)
        download(image_urls)
# Run the crawl only when executed as a script, so importing this module
# (e.g. to reuse get_page/download) does not start network traffic.
if __name__ == "__main__":
    main()
|