【相关学习推荐:python教程】
实验环境
1.安装python 3.7
2.安装 requests、bs4、pymysql 模块
实验步骤
1.安装环境及模块
可参考https://www.jb51.net/article/194104.htm
2.编写代码
# Scrape post metadata from a 51cto blog and insert it into a MySQL table.

# Import modules
import re

import bs4
import pymysql
import requests

# Connect to the database (host, account and password as given in the article)
db = pymysql.connect(host='172.171.13.229', user='root', passwd='abc123', db='test', port=3306, charset='utf8')
# Get a cursor
cursor = db.cursor()


def open_url(url):
    """Request *url* with a browser-like User-Agent header and return the response."""
    headers = {
        'user-agent': 'mozilla/5.0 (windows nt 10.0; wow64) applewebkit/537.36 (khtml, like gecko) '
                      'chrome/57.0.2987.98 safari/537.36'}
    res = requests.get(url, headers=headers)
    return res


def _strip_whitespace(text):
    """Remove every whitespace character (this includes \\xa0) from *text*."""
    return re.sub(r'\s', '', text)


def find_text(res):
    """Extract the post fields from one listing page and insert them into `blog`.

    Collects titles, read counts, comment counts, collect counts and publish
    dates from the response HTML, pairs them up positionally with ``zip`` and
    inserts one row per post using the module-level ``cursor`` / ``db``.
    """
    soup = bs4.BeautifulSoup(res.text, 'html.parser')

    # Blog titles; for titles containing "置顶" only the part before the
    # first space is kept.
    titles = []
    for tag in soup.find_all("a", class_="tit"):
        title = tag.text.strip()
        if "置顶" in title:
            title = title.split(' ')[0]
        titles.append(title)

    # Read counts: elements with class "read fl on" first, then "read fl",
    # in the same order as the original script.
    reads = [tag.text for tag in soup.find_all("p", class_="read fl on")]
    reads.extend(tag.text for tag in soup.find_all("p", class_="read fl"))

    # Comment counts
    comments = [tag.text for tag in soup.find_all("p", class_='comment fl')]

    # Collect (favourite) counts
    collects = [tag.text for tag in soup.find_all("p", class_='collect fl')]

    # Publish dates: keep the piece after the first ':' of the element text.
    # NOTE(review): with an ASCII ':' a "HH:MM:SS" time part is cut at the
    # hour - confirm against the live page markup.
    dates = [tag.text.split(':')[1] for tag in soup.find_all("a", class_='time fl')]

    # Parameterized insert: the driver escapes the scraped values, so quotes
    # in a title can neither break the statement nor inject SQL.
    sql = ("insert into blog (blog_title,read_number,comment_number, collect, dates) "
           "values (%s, %s, %s, %s, %s)")

    # Strip whitespace / \xa0 from the numeric fields and build one row per post.
    rows = [
        (title, _strip_whitespace(read), _strip_whitespace(comment),
         _strip_whitespace(collect), date)
        for title, read, comment, collect, date
        in zip(titles, reads, comments, collects, dates)
    ]
    if rows:
        cursor.executemany(sql, rows)
        db.commit()


def find_depth(res):
    """Return the total number of listing pages shown by the pager.

    Reads the text of the element two siblings before ``<li class="next">``.
    Returns 1 when the page has no such element instead of raising
    AttributeError on ``None``.
    """
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    next_item = soup.find('li', class_='next')
    if next_item is None:
        return 1
    depth = next_item.previous_sibling.previous_sibling.text
    return int(depth)


def main():
    """Crawl every listing page of the blog and store the extracted rows."""
    host = "https://blog.51cto.com/13760351"
    try:
        res = open_url(host)  # open the first page
        depth = find_depth(res)  # total number of pages
        # Crawl each listing page, 1..depth inclusive
        for i in range(1, depth + 1):
            url = host + '/p' + str(i)  # full page link
            res = open_url(url)  # open the page
            find_text(res)  # scrape and insert the data
    finally:
        # Release the cursor and the connection even if a request or parse fails
        cursor.close()
        db.close()


if __name__ == '__main__':
    main()

# 3. Create the corresponding table in MySQL (article step "3.mysql创建对应的表")
create table `blog` ( `row_id` int(11) not null auto_increment comment '主键', `blog_title` varchar(52) default null comment '博客标题', `read_number` varchar(26) default null comment '阅读数量', `comment_number` varchar(16) default null comment '评论数量', `collect` varchar(16) default null comment '收藏数量', `dates` varchar(16) default null comment '发布日期', primary key (`row_id`)) engine=innodb auto_increment=1 default charset=utf8;import reimport bs4import pymysqlimport requests# 连接数据库db = pymysql.connect(host='172.171.13.229', user='root', passwd='abc123', db='test', port=3306, charset='utf8')# 获取游标cursor = db.cursor()def open_url(url): # 连接模拟网页访问 headers = { 'user-agent': 'mozilla/5.0 (windows nt 10.0; wow64) applewebkit/537.36 (khtml, like gecko) ' 'chrome/57.0.2987.98 safari/537.36'} res = requests.get(url, headers=headers) return res# 爬取网页内容def find_text(res):
在网站优化中,导航有多少中类型呢?微信推送怎么附上文件_怎样在微信公众号上传附件图文步骤电脑中打开右键菜单显示问号等乱码如何解决小鸟云服务器可以用ssd硬盘吗腾讯企业邮箱、企业邮箱--功能优势腾讯云轻量服务器备案授权码在哪论运营的大局观:强势主动与自察思考怎么区别vps服务器