(一)古诗爬取

访问古诗文网站名句主页(https://so.gushiwen.cn/mingjus/),爬取里面的名句和出处(包括链接)保存到一个文本文件poems.txt中去。每个名句占用一行,内容格式如下:
编号(从1开始,占3位做对齐):名句–出处(全诗链接)
空两格(诗句的译文注释和赏析)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
"""Scrape famous quotes from gushiwen.cn and save them to poems.txt.

Each quote occupies one line formatted as:
    rank (right-aligned to 3 chars):quote--source(full-poem link)
followed by the quote's translation/annotation, each line indented two spaces.
"""
from bs4 import BeautifulSoup as BS
import requests

rank = 0

# `with` guarantees the file is closed even if a request raises mid-loop.
# The task spec names the output file poems.txt (the original wrote 诗词.txt).
with open("poems.txt", 'w', encoding='utf-8') as fs:
    soup = BS(requests.get("https://so.gushiwen.cn/mingjus/").content.decode("utf-8"), "lxml")
    content = soup.select('body > div.main3 > div.left > div.sons > div.cont')
    for i in content:
        # All anchors in this entry: [0] is the quote, [1] (if present) the source.
        # Named `links` — the original `str` shadowed the builtin.
        links = i.find_all('a')
        url = 'https://so.gushiwen.cn' + i.find('a')['href']
        detail = BS(requests.get(url).content.decode("utf-8"), "lxml")
        # Translation/annotation text of the full poem, two-space indented per spec
        # (join once instead of quadratic `+=` accumulation).
        body_lines = []
        for x in detail.select('#sonsyuanwen > div.cont > div.contson'):
            for z in x.text.split('\n'):
                body_lines.append("  " + z)
        line2 = '\n'.join(body_lines)
        poem = links[0].text
        if len(links) == 1:
            poet = "没有出处"
        else:
            poet = "出自" + links[1].text
        rank += 1
        # Right-align the rank itself to 3 chars; the original padded the whole
        # line (a no-op for any line longer than 3 chars) and omitted the
        # newline, so the translation ran onto the same line.
        fs.write(f"{rank:>3}:" + poem + "--" + poet + '(' + url + ')' + '\n')
        fs.write(line2 + '\n')

(二)显示影片基本信息

访问豆瓣电影Top250(https://movie.douban.com/top250?start=0),
获取每部电影的中文片名、排名、评分及其对应的链接,按照“排名-中文片名-评分-链接”的格式显示在屏幕上。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
"""Display Douban Movie Top250 on screen as 排名-片名-评分-链接 (also saved to file)."""
from bs4 import BeautifulSoup as BS
import requests

rank = 0
# Douban rejects requests without a browser-like User-Agent.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
}
# `with` guarantees the file is closed even if a request raises mid-loop.
with open("豆瓣.txt", 'w', encoding='utf-8') as fs:
    # 10 pages of 25 movies each: start = 0, 25, ..., 225.
    # (The original range(0, 251, 25) also fetched an empty page at start=250.)
    for page in range(0, 250, 25):
        soup = BS(requests.get(f"https://movie.douban.com/top250?start={page}&filter=", headers=headers).content.decode("utf-8"), "lxml")
        content = soup.select('#content > div > div.article > ol > li > div > div.info ')
        for i in content:
            rank = rank + 1
            name = i.select('.title')
            star = i.select('.rating_num')
            url = i.find('a')['href']
            # Build the line once instead of duplicating the expression.
            line = "排名:" + str(rank) + "-《" + name[0].text + "》-评分:" + star[0].text + "-链接:" + url
            print(line)
            fs.write(line + '\n')

(三)存储影片详细信息

访问豆瓣电影Top250(https://movie.douban.com/top250?start=0)
在问题1的基础上,获取每部电影的导演、编剧、主演、类型、上映时间、片长、评分人数以及剧情简介等信息,并将获取到的信息保存至本地文件中。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
"""Save detailed info for each Douban Top250 movie to a local file.

For every movie: title, director/cast/genre/year summary, writers,
runtime, number of raters, and the plot synopsis.
"""
from bs4 import BeautifulSoup as BS
import requests

# Douban rejects requests without a browser-like User-Agent.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
}
# `with` guarantees the file is closed even if a request raises mid-loop.
with open("豆瓣.txt", 'w', encoding='utf-8') as fs:
    # 10 pages of 25 movies each: start = 0, 25, ..., 225.
    for page in range(0, 250, 25):
        soup = BS(requests.get(f"https://movie.douban.com/top250?start={page}&filter=", headers=headers).content.decode("utf-8"), "lxml")
        content = soup.select('#content > div > div.article > ol > li > div > div.info ')
        for i in content:
            # Title (from the list page).
            name = "电影:" + i.select('.title')[0].text
            # Per-movie detail page.
            url = i.find('a')['href']
            sp = BS(requests.get(url, headers=headers).content.decode("utf-8"), "lxml")
            # Director / cast / genre / release year: the <p> summary on the list entry.
            mainer = i.find('p').text.strip()
            # Writers: second <span> inside #info — positional and fragile;
            # NOTE(review): verify if Douban changes its detail-page markup.
            writers = sp.select('#info > span')[1].text
            # Runtime from the detail page's v:runtime microdata span.
            all_time = "时长:" + sp.select('span[property="v:runtime"]')[0].text
            # Rater count: second-to-last <span> on the list entry.
            number = "评分人数:" + i.find_all('span')[-2].text
            # Synopsis from the v:summary microdata span.
            brief = "简介:" + sp.select('span[property="v:summary"]')[0].text
            # `record` instead of `all` — the original shadowed the builtin all().
            record = name + "\n" + mainer + "\n" + writers + "\n" + all_time + "\n" + number + "\n" + brief + "\n"
            fs.write(record + '\n\n')

(四)访问热搜榜并发送邮件

访问微博热搜榜(https://s.weibo.com/top/summary)
获取微博热搜榜前50条热搜名称、链接及其实时热度,并将获取到的数据通过邮件的形式,每20秒发送一次到个人邮箱中。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import requests
from bs4 import BeautifulSoup as BS
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import time

def get_line():
    """Fetch the Weibo hot-search board and format its top 50 entries.

    Returns a single string, one block per entry:
    "TopN:<title> 热度:<heat>\\n地址:<absolute link>".
    Requires a valid logged-in Cookie, otherwise Weibo redirects to login.
    """
    headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
    'Cookie': '4210214103(保密了)'
    }
    url = 'https://s.weibo.com/top/summary'
    soup = BS(requests.get(url, headers=headers).content.decode('utf-8'), 'lxml')
    tips = soup.select('#pl_top_realtimehot > table > tbody > tr > td.td-02 > a')
    hot = soup.select('#pl_top_realtimehot > table > tbody > tr > td.td-02 > span')
    # The original ran the anchor selector a second time into `x`; the result
    # is identical to `tips`, so it is reused here instead.
    line = '\n'
    # Starts at index 1 — presumably index 0 is the pinned ("置顶") row;
    # NOTE(review): confirm alignment of `hot` with `tips` against the live page.
    for i in range(1, 51):
        line += f"Top{i}:" + tips[i].text + " 热度:" + hot[i].text + "\n地址:https://s.weibo.com" + tips[i].get('href') + '\n\n'
    return line
def send_email(line):
    """Email *line* as a plain-text message via QQ's SMTP server over SSL."""
    sender_email = "941521358@qq.com"
    receiver_email = "ldyertop@qq.com"
    password = "4210214103(保密了)"

    # Assemble the MIME envelope: headers first, then the text body.
    msg = MIMEMultipart()
    for header, value in (('From', sender_email),
                          ('To', receiver_email),
                          ('Subject', "Ldyer的微博热搜")):
        msg[header] = value
    msg.attach(MIMEText(line, 'plain'))

    # SMTP_SSL as a context manager closes the connection on exit.
    with smtplib.SMTP_SSL('smtp.qq.com', 465) as server:
        server.login(sender_email, password)
        server.sendmail(sender_email, receiver_email, msg.as_string())
# Poll the hot-search board and mail it every 20 seconds, indefinitely.
while True:
    try:
        send_email(get_line())
        print("邮件已发送")
    except Exception as e:
        # A transient network/SMTP failure should not kill the polling loop;
        # report it and retry on the next cycle.
        print(f"发送失败: {e}")
    time.sleep(20)