Crawler for the Ministry of Education's published university list


import requests
from lxml import etree
import csv


class DaXue:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 FS'
        }
        self.links = []   # links to the per-province list pages
        self.datas = []   # one row per university

    def get_html(self, url):
        # Pass the headers as a keyword argument; the original code passed
        # them positionally, which requests treats as query params.
        resp = requests.get(url, headers=self.headers)
        resp.encoding = "utf-8"
        return resp.text

    def parse(self, text):
        # Collect the links to each province's list page.
        html = etree.HTML(text)
        links = html.xpath("//div[@class='province']/a/@href")
        for link in links:
            self.links.append(link)

    def parse_detail(self, text):
        # Parse the list table, skipping the first two header rows.
        html = etree.HTML(text)
        trs = html.xpath("//table[@class='table-x']/tbody/tr[position()>2]")
        for tr in trs:
            row = []
            cells = tr.xpath(".//td//text()")
            for cell in cells:
                if cell:
                    row.append(cell)
                else:
                    row.append(" ")
            self.datas.append(row)

    def save_datas(self, data):
        # newline="" prevents the blank lines csv.writer otherwise produces on Windows.
        with open("教育部公布具有招生资格的高校名单.csv", "w", encoding="utf-8", newline="") as f:
            f_csv = csv.writer(f)
            f_csv.writerows(data)


if __name__ == '__main__':
    url = "https://daxue.eol.cn/mingdan.shtml"
    dx = DaXue()
    html = dx.get_html(url)
    dx.parse(html)
    for link in dx.links:
        html2 = dx.get_html(link)
        dx.parse_detail(html2)   # accumulates rows into dx.datas; no return value
    dx.save_datas(dx.datas)
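
After the script finishes, the output can be spot-checked with the standard csv module. This is only a quick sanity-check sketch; it assumes the hard-coded filename from save_datas above and makes no assumptions about the column layout of the scraped table.

import csv

# Read back the CSV written by save_datas and preview a few rows.
with open("教育部公布具有招生资格的高校名单.csv", encoding="utf-8", newline="") as f:
    rows = list(csv.reader(f))

print(f"{len(rows)} rows scraped")
for row in rows[:5]:   # show the first few universities
    print(row)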