Python爬取51cto数据并存入MySQL方法详解

发布时间：2025-09-15 点击：21

【相关学习推荐：python教程】
实验环境
1.安装python 3.7
2.安装 requests、bs4、pymysql 模块
实验步骤
1.安装环境及模块
可参考https://www.jb51.net/article/194104.htm
2.编写代码
"""Scrape a 51cto blog's post listing and insert the rows into MySQL.

For every listing page of the blog, the script collects each post's title,
read count, comment count, favourite count and publish date, and inserts one
row per post into the ``blog`` table.
"""
import re

import bs4
import pymysql
import requests

# Database connection (host, account, password, schema).
# NOTE(review): credentials are hard-coded here; consider loading them from
# configuration or the environment instead.
db = pymysql.connect(
    host='172.171.13.229',
    user='root',
    passwd='abc123',
    db='test',
    port=3306,
    charset='utf8',
)
# Shared cursor; find_text() uses it for the inserts and main() closes it.
cursor = db.cursor()

# Matches any whitespace character. For str patterns this includes the
# non-breaking space (\xa0) that appears in the scraped counters.
_WHITESPACE = re.compile(r'\s')


def open_url(url):
    """Fetch *url* with a browser-like User-Agent header.

    Args:
        url: Address of the page to download.

    Returns:
        The ``requests`` response object for the page.
    """
    headers = {
        'user-agent': 'mozilla/5.0 (windows nt 10.0; wow64) applewebkit/537.36 (khtml, like gecko) '
                      'chrome/57.0.2987.98 safari/537.36'}
    # A timeout keeps the crawl from hanging forever on an unresponsive server.
    res = requests.get(url, headers=headers, timeout=10)
    return res


def find_text(res):
    """Parse one listing page and insert its posts into the ``blog`` table.

    Args:
        res: Response returned by :func:`open_url` for a listing page.

    The rows are written through the module-level ``cursor`` and committed
    on the module-level ``db`` connection once the whole page is inserted.
    """
    soup = bs4.BeautifulSoup(res.text, 'html.parser')

    # Post titles. Titles containing the "置顶" marker keep only the text
    # before the first space.
    titles = []
    for tag in soup.find_all("a", class_="tit"):
        title = tag.text.strip()
        if "置顶" in title:
            title = title.split(' ')[0]
        titles.append(title)

    # Read counts: entries with the extra "on" class are collected first,
    # then the plain ones.
    # NOTE(review): this grouping may not follow page order, so counts could
    # be paired with the wrong title when both variants occur — confirm.
    reads = [tag.text for tag in soup.find_all("p", class_="read fl on")]
    reads += [tag.text for tag in soup.find_all("p", class_="read fl")]

    # Comment counts.
    comments = [tag.text for tag in soup.find_all("p", class_='comment fl')]

    # Favourite ("collect") counts.
    collects = [tag.text for tag in soup.find_all("p", class_='collect fl')]

    # Publish dates: keep the part after the full-width colon.
    dates = [tag.text.split('：')[1] for tag in soup.find_all("a", class_='time fl')]

    # Placeholders are filled in by the driver, so quotes inside a title can
    # neither break the statement nor inject SQL.
    sql = ("insert into blog (blog_title, read_number, comment_number, collect, dates) "
           "values (%s, %s, %s, %s, %s)")

    for title, read, comment, collect, date in zip(titles, reads, comments, collects, dates):
        row = (
            title,
            _WHITESPACE.sub('', read),
            _WHITESPACE.sub('', comment),
            _WHITESPACE.sub('', collect),
            date,
        )
        cursor.execute(sql, row)
    # One commit per page rather than one per row.
    db.commit()


def find_depth(res):
    """Return the total number of listing pages shown in the pager.

    Args:
        res: Response returned by :func:`open_url` for the blog's first page.

    Returns:
        The page count as an ``int``; ``1`` when the page has no "next" pager
        entry.
    """
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    next_item = soup.find('li', class_='next')
    if next_item is None:
        # No pager on the page: treat the blog as a single page.
        return 1
    # Two hops back from the "next" entry; presumably the first hop lands on
    # the whitespace text node between the <li> elements.
    depth = next_item.previous_sibling.previous_sibling.text
    return int(depth)


def main():
    """Crawl every listing page of the blog and store its posts."""
    host = "https://blog.51cto.com/13760351"
    try:
        res = open_url(host)      # open the first page
        depth = find_depth(res)   # total number of pages
        # Crawl each listing page, 1..depth inclusive.
        for i in range(1, depth + 1):
            url = f"{host}/p{i}"  # full page link
            res = open_url(url)
            find_text(res)
    finally:
        # Release the cursor and the connection even if a page fails.
        cursor.close()
        db.close()


if __name__ == '__main__':
    main()

# 3. Create the corresponding MySQL table.
-- Schema of the `blog` table that receives the scraped rows: an auto-increment integer primary key (`row_id`) plus five varchar columns, each `default null` (title, read count, comment count, favourite count, publish date).
-- NOTE(review): `dates` is varchar(16) and `blog_title` is varchar(52); confirm the scraped publish-date and title text fit these widths.
-- NOTE(review): the text after the closing `;` repeats the opening of the Python script and breaks off at `def find_text(res):` — it looks like an extraction artifact; confirm before removing.
create table `blog` ( `row_id` int(11) not null auto_increment comment '主键', `blog_title` varchar(52) default null comment '博客标题', `read_number` varchar(26) default null comment '阅读数量', `comment_number` varchar(16) default null comment '评论数量', `collect` varchar(16) default null comment '收藏数量', `dates` varchar(16) default null comment '发布日期', primary key (`row_id`)) engine=innodb auto_increment=1 default charset=utf8;import reimport bs4import pymysqlimport requests# 连接数据库db = pymysql.connect(host='172.171.13.229', user='root', passwd='abc123', db='test', port=3306, charset='utf8')# 获取游标cursor = db.cursor()def open_url(url): # 连接模拟网页访问 headers = { 'user-agent': 'mozilla/5.0 (windows nt 10.0; wow64) applewebkit/537.36 (khtml, like gecko) ' 'chrome/57.0.2987.98 safari/537.36'} res = requests.get(url, headers=headers) return res# 爬取网页内容def find_text(res):

在网站优化中，导航有多少种类型呢？
微信推送怎么附上文件_怎样在微信公众号上传附件图文步骤
电脑中打开右键菜单显示问号等乱码如何解决
小鸟云服务器可以用ssd硬盘吗
腾讯企业邮箱、企业邮箱--功能优势
腾讯云轻量服务器备案授权码在哪
论运营的大局观：强势主动与自察思考
怎么区别vps服务器

上一篇：购买一个云服务器吗

下一篇：海外商标注册补贴