Background
While building a recommendation system I needed a training set, so I wrote a small crawler and used the bs4 library to process the fetched HTML and extract news article URLs.
The get_url() function does the actual fetching and HTML parsing; the remaining functions each drive it for one specific news category.
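Before the full listing, here is a minimal sketch of the bs4 pattern the crawler relies on, using one of the education index pages from the functions further down; it simply prints the href of every link wrapped in an <li> tag:

from urllib.request import urlopen
from bs4 import BeautifulSoup

page = urlopen('http://edu.people.com.cn/GB/227057/index1.html')
soup = BeautifulSoup(page.read(), 'html.parser')
for item in soup.find_all('li'):      # every list item that may wrap a headline
    link = item.find('a')             # the anchor carrying the article URL
    if link is not None and link.has_attr('href'):
        print(link['href'])           # may be relative; get_url() below turns it into a full URL
page.close()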
Here is the full source code:
# coding=utf-8
try:
    from urllib.request import urlopen   # Python 3
except ImportError:
    from urllib import urlopen           # Python 2
from bs4 import BeautifulSoup


def get_url(url, doc, label, name):
    """Fetch url, find every `label` tag (optionally filtered by class `name`),
    and append each matching news link to the file `doc`."""
    html = urlopen(url)
    bs_obj = BeautifulSoup(html.read(), 'html.parser')
    # open doc in append mode so new URLs are written to the end of the file
    f = open(doc, 'a')
    url_list = bs_obj.find_all(label, name)
    for address in url_list:
        link = address.find('a')
        if link is None or not link.has_attr('href'):
            continue
        address_url = link['href']
        if 'http' in address_url:
            # absolute URL: keep only links hosted on ifeng or sina
            if 'ifeng' in address_url or 'sina' in address_url:
                a = address_url
                f.write(a)
                f.write('\n')
            else:
                continue
        elif 'n1' in address_url:
            # relative people.com.cn URL: prepend the channel host that matches the output file
            if 'sports' in doc:
                a = 'http://sports.people.com.cn' + address_url
            elif 'science' in doc:
                a = 'http://scitech.people.com.cn' + address_url
            elif 'culture' in doc:
                a = 'http://culture.people.com.cn' + address_url
            elif 'film' in doc:
                a = 'http://ent.people.com.cn' + address_url
            elif 'game' in doc:
                a = 'http://game.people.com.cn' + address_url
            elif 'education' in doc:
                a = 'http://edu.people.com.cn/GB' + address_url
            else:
                continue
            f.write(a)
            f.write('\n')
    f.close()
    html.close()
    return
# Education
def education():
    smalltype = ['208610', '227065', '227057', '226718', '367001']
    page = 7
    for type in smalltype:
        for i in range(1, page + 1):
            url = 'http://edu.people.com.cn/GB/' + type + '/index' + str(i) + '.html'
            print(url)
            get_url(url, 'education.txt', 'li', '')
# Games
def game():
    smalltype = ['163384', '48661', '48662']
    page = 10
    for type in smalltype:
        for i in range(1, page + 1):
            url = 'http://game.people.com.cn/GB/' + type + '/index' + str(i) + '.html'
            print(url)
            get_url(url, 'game.txt', 'li', '')
# Film & TV, part 1
def film1():
    smalltype = ['6', '1370']
    # 15 * 20
    # ifeng only allows the first 20 pages to be accessed
    for type in smalltype:
        for i in range(1, 21):
            url = 'http://ent.ifeng.com/listpage/' + type + '/' + str(i) + '/list.shtml'
            print(url)
            get_url(url, 'film.txt', 'div', 'box_list clearfix')
# Film & TV, part 2
def film2():
    smalltype = ['81372', '81374']
    page = 7
    for type in smalltype:
        for i in range(1, page + 1):
            url = 'http://ent.people.com.cn/GB/' + type + '/index' + str(i) + '.html'
            print(url)
            get_url(url, 'film.txt', 'li', '')
# Culture, part 1
def culture1():
    smalltype = ['87423', '40473/40476', '22219', '27296']
    page = 7
    for type in smalltype:
        for i in range(1, page + 1):
            url = 'http://culture.people.com.cn/GB/' + type + '/index' + str(i) + '.html'
            print(url)
            get_url(url, 'culture.txt', 'li', '')
# Culture, part 2
def culture2():
    # each sub-channel has a different number of index pages, so map channel id -> page count
    typedict = {'206244/356129': 6, '206244/356132': 3, '206254/206256': 2,
                '206246/408046': 2, '206246/408047': 5, '206261': 2, '206244/356130': 3}
    for type in typedict:
        for i in range(1, typedict[type] + 1):
            url = 'http://art.people.com.cn/GB/' + type + '/index' + str(i) + '.html'
            print(url)
            get_url(url, 'culture.txt', 'div', 'hdNews clearfix')
culture1()
# Science & Technology
def science():
    smalltype = ['25895', '1056', '41163', '25893']
    page = 7
    for type in smalltype:
        for i in range(1, page + 1):
            url = 'http://scitech.people.com.cn/GB/' + type + '/index' + str(i) + '.html'
            print(url)
            get_url(url, 'science.txt', 'li', '')
# Sports, part 1
def sports1():
    smalltype = ['22155', '22134', '22141', '22149']
    page = 7
    for type in smalltype:
        for i in range(1, page + 1):
            url = 'http://sports.people.com.cn/GB/' + type + '/index' + str(i) + '.html'
            print(url)
            get_url(url, 'sports.txt', 'div', 'hdNews clearfix')
# Sports, part 2
def sports2():
    smalltype = ['22176', '35862', '22172']
    page = 4
    for type in smalltype:
        for i in range(1, page + 1):
            url = 'http://sports.people.com.cn/GB/' + type + '/index' + str(i) + '.html'
            print(url)
            get_url(url, 'sports.txt', 'li', '')
# Entertainment
def entertainment():
    smalltype = ['3', '1370', '30741', '44169']
    for smallertype in smalltype:
        # 15 * 20 * 4
        for i in range(1, 21):
            url = 'http://ent.ifeng.com/listpage/' + str(smallertype) + '/' + str(i) + '/list.shtml'
            print(url)
            get_url(url, 'entertainment.txt', 'div', 'box_list clearfix')
# Society
def society():
    smalltype = ['zqsk', 'fz-shyf', 'qwys', 'shwx']
    for type in smalltype:
        # 40 * 10 * 4
        for i in range(1, 11):
            url = 'http://roll.news.sina.com.cn/news/shxw/' + str(type) + '/index_' + str(i) + '.shtml'
            print(url)
            get_url(url, 'society.txt', 'li', '')
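The listing above only defines the crawlers (plus the lone culture1() call). One possible way to tie them together is a small driver that runs every category and then strips duplicate URLs from each output file; this is a sketch rather than part of the crawler itself, and the deduplication step is my own assumption, since the same article can show up on several index pages.

def crawl_all():
    # run every category crawler defined above
    for job in (education, game, film1, film2, culture1, culture2,
                science, sports1, sports2, entertainment, society):
        job()


def dedup(doc):
    # drop duplicate URLs from doc while keeping their original order
    with open(doc) as f:
        seen, urls = set(), []
        for line in f:
            u = line.strip()
            if u and u not in seen:
                seen.add(u)
                urls.append(u)
    with open(doc, 'w') as f:
        for u in urls:
            f.write(u + '\n')


if __name__ == '__main__':
    crawl_all()
    for doc in ('education.txt', 'game.txt', 'film.txt', 'culture.txt',
                'science.txt', 'sports.txt', 'entertainment.txt', 'society.txt'):
        dedup(doc)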