(1) Scraping Classical Poems
Visit the famous-quotes page of the Gushiwen website (https://so.gushiwen.cn/mingjus/), scrape the quotes and their sources (including links), and save them to a text file poems.txt. Each quote forms one entry in the following format:
Number (starting from 1, right-aligned in 3 characters for alignment): quote--source (link to the full poem)
Two leading spaces (the quote's translation, annotations, and appreciation)
from bs4 import BeautifulSoup as BS
import requests

rank = 0
temp_line2 = ''
fs = open("poems.txt", 'w', encoding='utf-8')
soup = BS(requests.get("https://so.gushiwen.cn/mingjus/").content.decode("utf-8"), "lxml")
content = soup.select('body > div.main3 > div.left > div.sons > div.cont')
for i in content:
    links = i.find_all('a')  # first <a> is the quote; the second, if present, is the source
    url = 'https://so.gushiwen.cn' + i.find('a')['href']
    temp_soup = BS(requests.get(url).content.decode("utf-8"), "lxml")
    temp_content = temp_soup.select('#sonsyuanwen > div.cont > div.contson')
    for x in temp_content:  # text block of the full work on the detail page
        temp_line1 = x.text.split('\n')
        for z in temp_line1:
            temp_line2 += "  " + z + '\n'  # indent every line by two spaces
    line2 = temp_line2[:-1]  # drop the trailing newline
    temp_line2 = ''
    poem = links[0].text
    if len(links) == 1:
        poet = "没有出处"
    else:
        poet = "出自" + links[1].text
    rank += 1
    line1 = f"{rank:>3}:" + poem + "--" + poet + '(' + url + ')'  # rank right-aligned in 3 chars
    fs.write(line1 + '\n')
    fs.write(line2 + '\n')
fs.close()
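Note that the 3-character alignment has to be applied to the rank itself, not to the finished line: padding a string that is already longer than three characters is a no-op. A minimal sketch of the required format (the helper name format_entry and the sample values are ours, not part of the assignment):

def format_entry(rank, quote, source, url):
    # "{:>3}" right-aligns the rank in a 3-character field, per the spec
    return f"{rank:>3}:{quote}--{source}({url})"

print(format_entry(1, "春眠不觉晓", "出自《春晓》", "https://so.gushiwen.cn/shiwenv.aspx"))
# ->   1:春眠不觉晓--出自《春晓》(https://so.gushiwen.cn/shiwenv.aspx)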
(2) Displaying Basic Movie Information
Visit Douban Movie Top 250 (https://movie.douban.com/top250?start=0),
obtain each movie's Chinese title, rank, rating, and link, and display them on screen in the format "rank - Chinese title - rating - link".
from bs4 import BeautifulSoup as BS
import requests

rank = 0
fs = open("豆瓣.txt", 'w', encoding='utf-8')
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
}
for page in range(0, 250, 25):  # 10 pages of 25 movies each
    soup = BS(requests.get(f"https://movie.douban.com/top250?start={page}&filter=",
                           headers=headers).content.decode("utf-8"), "lxml")
    content = soup.select('#content > div > div.article > ol > li > div > div.info')
    for i in content:
        rank = rank + 1
        name = i.select('.title')  # the first .title element is the Chinese title
        star = i.select('.rating_num')
        url = i.find('a')['href']
        line = "排名:" + str(rank) + "-《" + name[0].text + "》-评分:" + star[0].text + "-链接:" + url
        print(line)
        fs.write(line + '\n')
fs.close()
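Douban rejects requests that arrive without a browser-like User-Agent, which is why the headers dictionary above matters; if a request fails anyway, the script only notices later, as an IndexError on an empty selection. A more defensive fetch could be factored out as below; the helper name get_soup and the 10-second timeout are our choices, not part of the assignment:

import requests
from bs4 import BeautifulSoup as BS

def get_soup(url, headers, timeout=10):
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()  # surface HTTP errors instead of parsing an error page
    return BS(resp.content.decode("utf-8"), "lxml")

The main loop would then call get_soup(...) wherever it currently builds a soup inline.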
(3) Storing Detailed Movie Information
Visit Douban Movie Top 250 (https://movie.douban.com/top250?start=0).
Building on the previous task, obtain each movie's director, screenwriter, cast, genre, release date, runtime, number of raters, and plot summary, and save the retrieved information to a local file.
from bs4 import BeautifulSoup as BS
import requests

fs = open("豆瓣.txt", 'w', encoding='utf-8')
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
}
for page in range(0, 250, 25):
    soup = BS(requests.get(f"https://movie.douban.com/top250?start={page}&filter=",
                           headers=headers).content.decode("utf-8"), "lxml")
    content = soup.select('#content > div > div.article > ol > li > div > div.info')
    for i in content:
        name = "电影:" + i.select('.title')[0].text
        url = i.find('a')['href']
        # fetch the movie's own page for the screenwriter, runtime, and summary
        sp = BS(requests.get(url, headers=headers).content.decode("utf-8"), "lxml")
        mainer = i.find('p').text.strip()  # director / cast / year / genre from the list page
        writers = sp.select('#info > span')[1].text  # second <span> in #info: the screenwriter row
        all_time = "时长:" + sp.select('span[property="v:runtime"]')[0].text
        number = "评分人数:" + i.find_all('span')[-2].text  # the "...人评价" span on the list page
        brief = "简介:" + sp.select('span[property="v:summary"]')[0].text
        record = name + "\n" + mainer + "\n" + writers + "\n" + all_time + "\n" + number + "\n" + brief + "\n"
        fs.write(record + '\n\n')
fs.close()
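Unlike task (2), this version makes one extra request per movie, 250 detail pages in a burst, which is exactly the traffic pattern that tends to get a crawler throttled or blocked. A sketch of pacing the requests, assuming a fixed one-second delay is acceptable (both the helper name polite_get and the delay value are our assumptions):

import time
import requests

def polite_get(url, headers, delay=1.0):
    time.sleep(delay)  # pause before each request so the crawl does not burst
    return requests.get(url, headers=headers, timeout=10)

Substituting polite_get for the inner requests.get call stretches the full crawl to roughly four minutes, but makes it far less likely to be cut off partway through.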
(4) Fetching the Hot Search List and Emailing It
Visit the Weibo hot search list (https://s.weibo.com/top/summary).
Obtain the names, links, and real-time heat values of the top 50 hot searches, and email the data to a personal mailbox every 20 seconds.
import requests
from bs4 import BeautifulSoup as BS
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import time

def get_line():
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        'Cookie': '4210214103(保密了)'  # a logged-in cookie is required; redacted here
    }
    url = 'https://s.weibo.com/top/summary'
    soup = BS(requests.get(url, headers=headers).content.decode('utf-8'), 'lxml')
    tips = soup.select('#pl_top_realtimehot > table > tbody > tr > td.td-02 > a')
    hot = soup.select('#pl_top_realtimehot > table > tbody > tr > td.td-02 > span')
    line = '\n'
    for i in range(1, 51):  # row 0 is the pinned entry, so ranks start at index 1
        line += f"Top{i}:" + tips[i].text + " 热度:" + hot[i].text \
                + "\n地址:https://s.weibo.com" + tips[i].get('href') + '\n\n'
    return line

def send_email(line):
    sender_email = "941521358@qq.com"
    receiver_email = "ldyertop@qq.com"
    password = "4210214103(保密了)"  # SMTP authorization code; redacted here
    message = MIMEMultipart()
    message['From'] = sender_email
    message['To'] = receiver_email
    message['Subject'] = "Ldyer的微博热搜"
    message.attach(MIMEText(line, 'plain'))
    with smtplib.SMTP_SSL('smtp.qq.com', 465) as server:
        server.login(sender_email, password)
        server.sendmail(sender_email, receiver_email, message.as_string())

while True:
    send_email(get_line())
    print("邮件已发送")
    time.sleep(20)
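Both the cookie and the SMTP password are redacted in the listing above, and neither should be hard-coded in a real script. Note that QQ Mail's SMTP login expects an authorization code generated in the mailbox settings, not the account password. A minimal sketch that reads both secrets from environment variables (the names WEIBO_COOKIE and SMTP_AUTH_CODE are our invention):

import os

# Set these in the shell before running, e.g.:
#   export WEIBO_COOKIE='...'
#   export SMTP_AUTH_CODE='...'
cookie = os.environ['WEIBO_COOKIE']      # value for the Cookie header in get_line()
password = os.environ['SMTP_AUTH_CODE']  # passed to server.login() in send_email()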