Python Multi-Threaded Crawler and Multiple Data Storage Methods (Python Crawler in Practice 2)

1. Multi-Process Crawler

  For crawls that involve a large amount of data or place heavier demands on data processing, Python's multi-process or multi-threading mechanisms can be used. Multi-processing starts several worker processes that the operating system can schedule on different CPU cores in parallel; multi-threading runs several cooperating "sub-tasks" inside a single process, sharing its memory (and, in CPython, only one thread executes Python bytecode at any moment because of the GIL). Python offers several modules for both; here the multiprocessing module is used to build a multi-process crawler. During testing it turned out that, because the site has anti-crawler measures, the crawler starts reporting errors once the number of URLs and processes gets large.
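  To make the difference concrete, here is a minimal sketch (not part of the original code): multiprocessing.Pool spreads the calls over separate worker processes, while multiprocessing.dummy.Pool exposes the same map() API but runs the workers as threads inside one process. fetch_page is just a hypothetical stand-in for any I/O-bound download job.

import requests
from multiprocessing import Pool                        # process-based pool
from multiprocessing.dummy import Pool as ThreadPool    # thread-based pool with the same API

def fetch_page(url):
    '''Download one page and return its length; a stand-in for any I/O-bound job.'''
    return len(requests.get(url).text)

if __name__ == "__main__":
    urls = ['https://www.qiushibaike.com/text/page/{}'.format(i) for i in range(1, 4)]
    print(Pool(processes=2).map(fetch_page, urls))       # work split across 2 processes
    print(ThreadPool(4).map(fetch_page, urls))           # same work split across 4 threads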


2. The Code

#!/usr/bin/python
# -*- coding: utf-8 -*-

import re
import time
import requests
from multiprocessing import Pool

duanzi_list = []

def get_web_html(url):
    '''
    @params: url, fetch the HTML of the given page
    '''
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0"}
    response = None
    try:
        req = requests.get(url, headers=headers)
        if req.status_code == 200:
            response = req.text.encode('utf8')
    except Exception as e:
        print e
    return response

def scrap_qiushi_info(url):
    '''
    @params: url, extract the joke entries from one page
    '''
    html = get_web_html(url)
    if not html:
        return duanzi_list
    usernames = re.findall(r'<h2>(.*?)</h2>', html, re.S|re.M)
    levels = re.findall(r'<div class="articleGender \w*Icon">(\d+)</div>', html, re.S|re.M)
    laugh_counts = re.findall(r'<i class="number">(\d+)</i>', html, re.S|re.M)
    comment_counts = re.findall(r'<i class="number">(\d+)</i> 评论', html, re.S|re.M)
    # the joke text sits inside a <span> within <div class="content">
    contents = re.findall(r'<div class="content">.*?<span>(.*?)</span>', html, re.S|re.M)
    for username, level, laugh_count, comment_count, content in zip(usernames, levels, laugh_counts, comment_counts, contents):
        information = {
            "username": username.strip(),
            "level": level.strip(),
            "laugh_count": laugh_count.strip(),
            "comment_count": comment_count.strip(),
            "content": content.strip()
        }
        duanzi_list.append(information)
    time.sleep(1)
    return duanzi_list

def normal_scapper(url_lists):
    '''
    Driver for the plain crawl: fetch the URLs one by one and time the run
    '''
    begin_time = time.time()
    for url in url_lists:
        scrap_qiushi_info(url)
    end_time = time.time()
    print "Single-process crawl took: %f seconds" % (end_time - begin_time)

def muti_process_scapper(url_lists, process_num=2):
    '''
    Driver for the multi-process crawl: fetch the URLs with a multiprocessing Pool
    '''
    begin_time = time.time()
    pool = Pool(processes=process_num)
    pool.map(scrap_qiushi_info, url_lists)
    end_time = time.time()
    print "Crawl with %d processes took: %s seconds" % (process_num, (end_time - begin_time))

def main():
    '''
    Entry point: build the URL list with a list comprehension and run both crawlers
    '''
    url_lists = ['https://www.qiushibaike.com/text/page/{}'.format(i) for i in range(1, 11)]
    normal_scapper(url_lists)
    muti_process_scapper(url_lists, process_num=2)


if __name__ == "__main__":
    main()
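One caveat about the multi-process run: each worker process gets its own copy of the module globals, so the appends to duanzi_list happen inside the children and the parent's duanzi_list stays empty; the scraped entries only come back through the return values that pool.map() collects from scrap_qiushi_info(). A minimal, self-contained demonstration of this behaviour:

from multiprocessing import Pool

collected = []

def work(n):
    collected.append(n)       # runs in a child process, so the parent's list is untouched
    return n * n

if __name__ == "__main__":
    pool = Pool(processes=2)
    results = pool.map(work, range(5))
    print(collected)           # [] -- the children's appends never reach the parent
    print(results)             # [0, 1, 4, 9, 16] -- collect data through return values instead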

3. Storing the Scraped Data in MongoDB

#!/usr/bin/python
# -*- coding: utf-8 -*-

import re
import time
import json
import requests
import pymongo
from multiprocessing import Pool

duanzi_list = []

def get_web_html(url):
    '''
    @params: url, fetch the HTML of the given page
    '''
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0"}
    response = None
    try:
        req = requests.get(url, headers=headers)
        if req.status_code == 200:
            response = req.text.encode('utf8')
    except Exception as e:
        print e
    return response

def scrap_qiushi_info(url):
    '''
    @params: url, extract the joke entries from one page
    '''
    html = get_web_html(url)
    if not html:
        return duanzi_list
    usernames = re.findall(r'<h2>(.*?)</h2>', html, re.S|re.M)
    levels = re.findall(r'<div class="articleGender \w*Icon">(\d+)</div>', html, re.S|re.M)
    laugh_counts = re.findall(r'<i class="number">(\d+)</i>', html, re.S|re.M)
    comment_counts = re.findall(r'<i class="number">(\d+)</i> 评论', html, re.S|re.M)
    # the joke text sits inside a <span> within <div class="content">
    contents = re.findall(r'<div class="content">.*?<span>(.*?)</span>', html, re.S|re.M)
    for username, level, laugh_count, comment_count, content in zip(usernames, levels, laugh_counts, comment_counts, contents):
        information = {
            "username": username.strip(),
            "level": level.strip(),
            "laugh_count": laugh_count.strip(),
            "comment_count": comment_count.strip(),
            "content": content.strip()
        }
        duanzi_list.append(information)
    return duanzi_list

def write_into_mongo(datas):
    '''
    @datas: the scraped entries to insert into MongoDB, each wrapped in a dict;
    iterate over them and insert one by one, insert_one() inserts a single document per call
    '''
    client = pymongo.MongoClient('localhost', 27017)
    duanzi = client['duanzi_db']
    duanzi_info = duanzi['duanzi_info']
    for data in datas:
        duanzi_info.insert_one(data)

def query_data_from_mongo():
    '''
    Query the documents stored in MongoDB
    '''
    client = pymongo.MongoClient('localhost', 27017)['duanzi_db']['duanzi_info']
    for data in client.find():
        print data
    print "Found %d documents in total" % (client.find().count())


def main():
    '''
    Entry point: build the URL list with a list comprehension, crawl each page, then store the data
    '''
    url_lists = ['https://www.qiushibaike.com/text/page/{}'.format(i) for i in range(1, 11)]
    for url in url_lists:
        scrap_qiushi_info(url)
        time.sleep(1)
    write_into_mongo(duanzi_list)

if __name__ == "__main__":
    main()
    #query_data_from_mongo()
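Inserting document by document works, but pymongo can also take the whole list in one call. A small sketch, assuming the same local MongoDB instance and the duanzi_list built above (insert_many() raises on an empty list, hence the guard):

import pymongo

client = pymongo.MongoClient('localhost', 27017)
collection = client['duanzi_db']['duanzi_info']
if duanzi_list:
    collection.insert_many(duanzi_list)     # one round trip for the whole batch
print(collection.count_documents({}))       # preferred over cursor.count() in recent pymongo versions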

4. Inserting into MySQL

  To keep the scraped data permanently in the relational database MySQL, the database and the table have to be created first, as shown below:

1. Create the database
MariaDB [(none)]> create database qiushi;
Query OK, 1 row affected (0.00 sec)

2. Switch to the database
MariaDB [(none)]> use qiushi;
Database changed

3. Create the table
MariaDB [qiushi]> create table qiushi_info(id int(32) unsigned primary key auto_increment,username varchar(64) not null,level int default 0,laugh_count int default 0,comment_count int default 0,content text default '')engine=InnoDB charset='UTF8';
Query OK, 0 rows affected, 1 warning (0.06 sec)

MariaDB [qiushi]> show create table qiushi_info;
+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Table       | Create Table                                                                                                                                                                                                                                                                                            |
+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| qiushi_info | CREATE TABLE `qiushi_info` (
  `id` int(32) unsigned NOT NULL AUTO_INCREMENT,
  `username` varchar(64) NOT NULL,
  `level` int(11) DEFAULT '0',
  `laugh_count` int(11) DEFAULT '0',
  `comment_count` int(11) DEFAULT '0',
  `content` text,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 |
+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
1 row in set (0.00 sec)
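(The "1 warning" comes from content text default '': MySQL/MariaDB does not allow a DEFAULT value on TEXT columns, so the default is dropped, as the SHOW CREATE TABLE output confirms.) If you prefer to set this up from Python rather than the MariaDB shell, here is a sketch using pymysql, assuming the same local server and root account:

import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='root', password='', charset='utf8')
cursor = conn.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS qiushi")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS qiushi.qiushi_info(
        id INT(32) UNSIGNED PRIMARY KEY AUTO_INCREMENT,
        username VARCHAR(64) NOT NULL,
        level INT DEFAULT 0,
        laugh_count INT DEFAULT 0,
        comment_count INT DEFAULT 0,
        content TEXT
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8
""")
conn.commit()
cursor.close()
conn.close()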

 The code that writes the data into MySQL:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#blog:http://www.cnblogs.com/cloudlab/

import re
import time
import pymysql
import requests

duanzi_list = []

def get_web_html(url):
    '''
    @params: url, fetch the HTML of the given page
    '''
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0"}
    response = None
    try:
        req = requests.get(url, headers=headers)
        if req.status_code == 200:
            response = req.text.encode('utf8')
    except Exception as e:
        print e
    return response

def scrap_qiushi_info(url):
    '''
    @params: url, extract the joke entries from one page
    '''
    html = get_web_html(url)
    if not html:
        return duanzi_list
    usernames = re.findall(r'<h2>(.*?)</h2>', html, re.S|re.M)
    levels = re.findall(r'<div class="articleGender \w*Icon">(\d+)</div>', html, re.S|re.M)
    laugh_counts = re.findall(r'<i class="number">(\d+)</i>', html, re.S|re.M)
    comment_counts = re.findall(r'<i class="number">(\d+)</i> 评论', html, re.S|re.M)
    # the joke text sits inside a <span> within <div class="content">
    contents = re.findall(r'<div class="content">.*?<span>(.*?)</span>', html, re.S|re.M)
    for username, level, laugh_count, comment_count, content in zip(usernames, levels, laugh_counts, comment_counts, contents):
        information = {
            "username": username.strip(),
            "level": level.strip(),
            "laugh_count": laugh_count.strip(),
            "comment_count": comment_count.strip(),
            "content": content.strip()
        }
        duanzi_list.append(information)
    return duanzi_list

def write_into_mysql(datas):
    '''
    @params: datas, write the scraped entries into the MySQL database
    '''
    try:
        conn = pymysql.connect(host='localhost', port=3306, user='root', password='', db='qiushi', charset='utf8')
        cursor = conn.cursor(pymysql.cursors.DictCursor)
        for data in datas:
            data_list = (data['username'], int(data['level']), int(data['laugh_count']), int(data['comment_count']), data['content'])
            # parameterized query: quotes inside the joke text cannot break the SQL
            sql = "INSERT INTO qiushi_info(username,level,laugh_count,comment_count,content) VALUES(%s,%s,%s,%s,%s)"
            cursor.execute(sql, data_list)
            conn.commit()
    except Exception as e:
        print e
    cursor.close()
    conn.close()


def main():
    '''
    Entry point: build the URL list with a list comprehension, crawl each page, then store the data
    '''
    url_lists = ['https://www.qiushibaike.com/text/page/{}'.format(i) for i in range(1, 11)]
    for url in url_lists:
        scrap_qiushi_info(url)
        time.sleep(1)
    write_into_mysql(duanzi_list)

if __name__ == "__main__":
    main()
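To check what landed in the table, the rows can be read back, much like query_data_from_mongo() does for MongoDB. A sketch, assuming the same connection settings as above:

import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='root', password='', db='qiushi', charset='utf8')
cursor = conn.cursor(pymysql.cursors.DictCursor)
cursor.execute("SELECT username, laugh_count, comment_count FROM qiushi_info ORDER BY laugh_count DESC LIMIT 10")
for row in cursor.fetchall():               # DictCursor returns each row as a dict
    print(row)
cursor.close()
conn.close()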

5. Writing the Scraped Data to a CSV File

  A CSV file is comma-separated text that can be read either as plain text or in Excel, which makes it a common storage format; here the scraped data is saved into a CSV file.

The code that saves the data to a CSV file:

#!/usr/bin/python
# -*- coding: utf-8 -*-
#blog:http://www.cnblogs.com/cloudlab/

import re
import csv
import time
import requests

duanzi_list = []

def get_web_html(url):
    '''
    @params: url, fetch the HTML of the given page
    '''
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0"}
    response = None
    try:
        req = requests.get(url, headers=headers)
        if req.status_code == 200:
            response = req.text.encode('utf8')
    except Exception as e:
        print e
    return response

def scrap_qiushi_info(url):
    '''
    @params: url, extract the joke entries from one page
    '''
    html = get_web_html(url)
    if not html:
        return duanzi_list
    usernames = re.findall(r'<h2>(.*?)</h2>', html, re.S|re.M)
    levels = re.findall(r'<div class="articleGender \w*Icon">(\d+)</div>', html, re.S|re.M)
    laugh_counts = re.findall(r'<i class="number">(\d+)</i>', html, re.S|re.M)
    comment_counts = re.findall(r'<i class="number">(\d+)</i> 评论', html, re.S|re.M)
    # the joke text sits inside a <span> within <div class="content">
    contents = re.findall(r'<div class="content">.*?<span>(.*?)</span>', html, re.S|re.M)
    for username, level, laugh_count, comment_count, content in zip(usernames, levels, laugh_counts, comment_counts, contents):
        information = {
            "username": username.strip(),
            "level": level.strip(),
            "laugh_count": laugh_count.strip(),
            "comment_count": comment_count.strip(),
            "content": content.strip()
        }
        duanzi_list.append(information)
    return duanzi_list

def write_into_csv(datas, filename):
    '''
    @datas: the entries to write into the CSV file, a list of dicts
    @params: filename, path of the target CSV file
    '''
    with open(filename, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(('username', 'level', 'laugh_count', 'comment_count', 'content'))
        for data in datas:
            writer.writerow((data['username'], data['level'], data['laugh_count'], data['comment_count'], data['content']))

def main():
    '''
    Entry point: build the URL list with a list comprehension, crawl each page, then store the data
    '''
    url_lists = ['https://www.qiushibaike.com/text/page/{}'.format(i) for i in range(1, 11)]
    for url in url_lists:
        scrap_qiushi_info(url)
        time.sleep(1)
    write_into_csv(duanzi_list, '/root/duanzi_info.csv')

if __name__ == "__main__":
    main()
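A quick way to verify the file is to read it back with csv.DictReader, which turns the header row into dictionary keys. (Under Python 3 the file should also be written with open(filename, 'w', newline='') to avoid blank lines between rows on Windows.) A sketch, assuming the path used above:

import csv

with open('/root/duanzi_info.csv') as f:
    rows = list(csv.DictReader(f))      # one dict per data row, keyed by the header
print("read %d rows" % len(rows))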

6. Writing the Scraped Data to a Plain Text File

#!/usr/bin/python
# -*- coding: utf-8 -*-
#blog:http://www.cnblogs.com/cloudlab/

import re
import csv
import time
import requests

duanzi_list = []

def get_web_html(url):
    '''
    @params: url, fetch the HTML of the given page
    '''
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0"}
    response = None
    try:
        req = requests.get(url, headers=headers)
        if req.status_code == 200:
            response = req.text.encode('utf8')
    except Exception as e:
        print e
    return response

def scrap_qiushi_info(url):
    '''
    @params: url, extract the joke entries from one page
    '''
    html = get_web_html(url)
    if not html:
        return duanzi_list
    usernames = re.findall(r'<h2>(.*?)</h2>', html, re.S|re.M)
    levels = re.findall(r'<div class="articleGender \w*Icon">(\d+)</div>', html, re.S|re.M)
    laugh_counts = re.findall(r'<i class="number">(\d+)</i>', html, re.S|re.M)
    comment_counts = re.findall(r'<i class="number">(\d+)</i> 评论', html, re.S|re.M)
    # the joke text sits inside a <span> within <div class="content">
    contents = re.findall(r'<div class="content">.*?<span>(.*?)</span>', html, re.S|re.M)
    for username, level, laugh_count, comment_count, content in zip(usernames, levels, laugh_counts, comment_counts, contents):
        information = {
            "username": username.strip(),
            "level": level.strip(),
            "laugh_count": laugh_count.strip(),
            "comment_count": comment_count.strip(),
            "content": content.strip()
        }
        duanzi_list.append(information)
    return duanzi_list

def write_into_files(datas, filename):
    '''
    Write the scraped entries to a plain text file
    @params: datas, the entries to write
    @filename: path of the target file
    '''
    print "Start writing the file..."
    with open(filename, 'w') as f:
        f.write("username" + "\t" + "level" + "\t" + "laugh_count" + "\t" + "comment_count" + "\t" + "content" + "\n")
        for data in datas:
            f.write(data['username'] + "\t" + \
                data['level'] + "\t" + \
                data['laugh_count'] + "\t" + \
                data['comment_count'] + "\t" + \
                data['content'] + "\n" + "\n"
            )

def main():
    '''
    Entry point: build the URL list with a list comprehension, crawl each page, then store the data
    '''
    url_lists = ['https://www.qiushibaike.com/text/page/{}'.format(i) for i in range(1, 11)]
    for url in url_lists:
        scrap_qiushi_info(url)
        time.sleep(1)
    write_into_files(duanzi_list, '/root/duanzi.txt')

if __name__ == "__main__":
    main()
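Under Python 2 the scraped values are UTF-8 encoded byte strings, so the plain write() calls above work as-is. To make the encoding explicit (and keep the same code working under Python 3), io.open can be used instead; a sketch, assuming duanzi_list holds the dictionaries built above:

import io

with io.open('/root/duanzi.txt', 'w', encoding='utf-8') as f:
    for data in duanzi_list:
        fields = [data['username'], data['level'], data['laugh_count'],
                  data['comment_count'], data['content']]
        # decode any byte strings so the text-mode file object always receives text
        fields = [x.decode('utf-8') if isinstance(x, bytes) else x for x in fields]
        f.write(u"\t".join(fields) + u"\n")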

 
