# Scraper: Ministry-of-Education list of universities approved to admit students (daxue.eol.cn)
import csv

import requests
from lxml import etree
class DaXue():
    """Scrape daxue.eol.cn for the list of universities approved to admit students.

    Workflow: get_html() fetches a page, parse() collects the per-province
    detail links, parse_detail() accumulates one row per university, and
    save_datas() writes everything to a UTF-8 CSV file.
    """

    def __init__(self):
        # Browser-like User-Agent so the site does not reject the client.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 FS'
        }
        self.links = []   # per-province detail-page URLs collected by parse()
        self.datas = []   # one row (list of cell strings) per university

    def get_html(self, url):
        """Fetch *url* and return its body decoded as UTF-8 text."""
        # BUG FIX: the original passed the headers dict positionally, which
        # requests.get() binds to the `params` argument (query string) —
        # the custom User-Agent was never actually sent.
        resp = requests.get(url, headers=self.headers)
        resp.encoding = "utf-8"
        return resp.text

    def parse(self, text):
        """Collect the per-province detail links from the index page into self.links."""
        html = etree.HTML(text)
        self.links.extend(html.xpath("//div[@class='province']/a/@href"))

    def parse_detail(self, text):
        """Parse one province page, appending one row per university to self.datas."""
        html = etree.HTML(text)
        # position()>2 skips the table's two header rows.
        trs = html.xpath("//table[@class='table-x']/tbody/tr[position()>2]")
        for tr in trs:
            row = []  # renamed from `list`, which shadowed the builtin
            for cell in tr.xpath(".//td//text()"):
                # Keep the original placeholder behavior for falsy cells.
                row.append(cell if cell else " ")
            self.datas.append(row)

    def save_datas(self, data):
        """Write *data* (an iterable of rows) to a UTF-8 CSV file, overwriting it.

        newline="" stops the csv module from emitting blank rows on Windows;
        mode "w" replaces the original "w+" — the file is only written, never read.
        """
        with open("教育部公布具有招生资格的高校名单.csv", "w", encoding="utf-8", newline="") as f:
            csv.writer(f).writerows(data)
if __name__ == '__main__':
    # Crawl the index page, then every per-province detail page.
    url = "https://daxue.eol.cn/mingdan.shtml"
    dx = DaXue()
    dx.parse(dx.get_html(url))
    for link in dx.links:
        # parse_detail() accumulates rows into dx.datas and returns None,
        # so there is nothing to bind from the call.
        dx.parse_detail(dx.get_html(link))
    # BUG FIX: write the CSV once after all pages are parsed — the original
    # called save_datas(dx.datas) inside the loop, rewriting the whole file
    # on every single iteration.
    dx.save_datas(dx.datas)