Python教程:使用BeautifulSoup爬取城市公交信息

在这个Python教程中,我们将学习如何使用BeautifulSoup库爬取城市公交信息。我们将以杭州为例,但你可以将代码应用于其他城市。如果有什么错误或者值得改进的地方,欢迎大家在评论区指出!

首先,确保你已经安装了以下库:

BeautifulSoup
requests
pandas

如果没有,请使用以下命令安装:

1
pip install beautifulsoup4 requests pandas

接下来,我们将编写代码。

1.导入所需的库:

1
2
3
from bs4 import BeautifulSoup as BS
import requests
import pandas as pd

2.设置城市名称和需要的参数:包括城市名 city、目标网站的根地址 url、行计数器 cnt = 0,以及用于存放待抓取链接的空列表 href_list = []。后续步骤的代码都依赖这几个变量:

3.创建一个空的DataFrame来存储公交信息:

1
# Accumulator for the scraped routes: one row per bus line, covering the
# route name, schedule, fare, and outbound/return trip details.
df = pd.DataFrame(
    columns=[
        '线路名称', '运行时间', '参考票价', '出发线路', '总站数',
        '经过站点', '返回线路', '返回总站数', '返回经过站点',
    ]
)

4.获取公交路线列表:

1
2
3
4
5
6
7
# Fetch and parse the route index page. `url` (site root) and `href_list`
# (empty list) must be defined in step 2.
# NOTE(review): the original snippet had lost its loop indentation and
# repeated the same select/append logic twice — restored and deduplicated.
soup1 = BS(requests.get(url).content.decode('utf-8'), 'lxml')
# Routes appear to be indexed twice on the page: nth-child(1) and
# nth-child(2) of the layer div — collect the links from both.
for child in (1, 2):
    links = soup1.select(f'div > div.bus-layer.depth.w120 > div:nth-child({child}) a')
    for element in links:
        href_list.append(url + element['href'])

5.遍历每个公交路线,并获取详细信息:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# Visit every index page collected in step 4, then every individual route
# page it links to, and append one row per route to `df`.
# `url`, `href_list`, `df` and the row counter `cnt` come from earlier steps.
# NOTE(review): the original snippet had lost all loop/branch indentation —
# restored here.
for web1 in href_list:
    soup2 = BS(requests.get(web1).content.decode('utf-8'), 'lxml')
    for element in soup2.select('div > div.list.clearfix a'):
        website = url + element['href']
        soup3 = BS(requests.get(website).content.decode('utf-8'), 'lxml')

        # Route name.
        name = soup3.select('div > div.info > h1 > span')[0].text
        # Operating hours and reference fare: drop the 5-character
        # "运行时间:" / "参考票价:" label prefix.
        time = soup3.select('div > div.info > ul > li:nth-child(1)')[0].text[5:]
        price = soup3.select('div > div.info > ul > li:nth-child(2)')[0].text[5:]

        # Outbound and (optional) return trip headlines; one-way routes
        # have a single 'trip' element.
        come_back = soup3.find_all(class_='trip')
        come_line = come_back[0].text
        back_line = come_back[1].text if len(come_back) == 2 else "Null"

        # Outbound stops: count them and concatenate their names.
        sum_come = 0
        str_come = ""
        for stop in soup3.select('div.service-area > div:nth-child(2) > ol> li >a'):
            if 'aria-label' in stop.attrs:
                str_come += stop['aria-label']
                sum_come += 1
        sum_come = f"共{sum_come}站"

        # Return stops; absent for one-way routes.
        sum_back = 0
        str_back = ""
        for stop in soup3.select('div.service-area > div:nth-child(4) > ol> li >a'):
            if 'aria-label' in stop.attrs:
                str_back += stop['aria-label']
                sum_back += 1
        sum_back = "Null" if sum_back == 0 else f"共{sum_back}站"

        df.loc[cnt] = [name, time, price, come_line, sum_come, str_come,
                       back_line, sum_back, str_back]
        cnt += 1

6.将DataFrame保存为CSV文件:

1
# Write the scraped table to "<city>.csv" (`city` is set in step 2);
# index=False keeps the synthetic row counter out of the file.
df.to_csv(f'{city}.csv', index=False)

7.完整代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from bs4 import BeautifulSoup as BS
import requests
import pandas as pd


# --- Step 2: city name and scraping parameters -------------------------------
# BUG FIX: the original script used `city`, `url` and `cnt` without ever
# defining them (NameError on first use) — define them here.
# NOTE(review): the selectors look like they target a 8684.cn-style bus
# site; confirm the base URL for your city before running.
city = 'hangzhou'
url = f'https://{city}.8684.cn'
cnt = 0  # next row index for `df`

# One row per bus line: name, schedule, fare, plus outbound/return details.
df = pd.DataFrame(
    columns=['线路名称', '运行时间', '参考票价', '出发线路', '总站数',
             '经过站点', '返回线路', '返回总站数', '返回经过站点'])

# --- Step 4: collect the route index pages -----------------------------------
# A timeout is passed to every request so a stalled connection cannot hang
# the script forever.
href_list = []
soup1 = BS(requests.get(url, timeout=10).content.decode('utf-8'), 'lxml')
# The index links live under nth-child(1) and nth-child(2) of the layer div;
# collect both groups.
for child in (1, 2):
    links = soup1.select(f'div > div.bus-layer.depth.w120 > div:nth-child({child}) a')
    for element in links:
        href_list.append(url + element['href'])

# --- Step 5: scrape every individual route page ------------------------------
for web1 in href_list:
    soup2 = BS(requests.get(web1, timeout=10).content.decode('utf-8'), 'lxml')
    for element in soup2.select('div > div.list.clearfix a'):
        website = url + element['href']
        print(website + " ", cnt)
        soup3 = BS(requests.get(website, timeout=10).content.decode('utf-8'), 'lxml')

        # Route name.
        name = soup3.select('div > div.info > h1 > span')[0].text
        # Operating hours and reference fare: drop the 5-character
        # "运行时间:" / "参考票价:" label prefix.
        time = soup3.select('div > div.info > ul > li:nth-child(1)')[0].text[5:]
        price = soup3.select('div > div.info > ul > li:nth-child(2)')[0].text[5:]

        # Outbound and (optional) return trip headlines; one-way routes
        # have a single 'trip' element.
        come_back = soup3.find_all(class_='trip')
        come_line = come_back[0].text
        back_line = come_back[1].text if len(come_back) == 2 else "Null"

        # Outbound stops: count them and concatenate their names.
        sum_come = 0
        str_come = ""
        for stop in soup3.select('div.service-area > div:nth-child(2) > ol> li >a'):
            if 'aria-label' in stop.attrs:
                str_come += stop['aria-label']
                sum_come += 1
        sum_come = f"共{sum_come}站"

        # Return stops; absent for one-way routes.
        sum_back = 0
        str_back = ""
        for stop in soup3.select('div.service-area > div:nth-child(4) > ol> li >a'):
            if 'aria-label' in stop.attrs:
                str_back += stop['aria-label']
                sum_back += 1
        sum_back = "Null" if sum_back == 0 else f"共{sum_back}站"

        df.loc[cnt] = [name, time, price, come_line, sum_come, str_come,
                       back_line, sum_back, str_back]
        cnt += 1

# --- Step 6: save ------------------------------------------------------------
df.to_csv(f'{city}.csv', index=False)