scrapy 爬虫
保存为 markdown 格式 《不在现场》前11章
使用 vim 正则表达式 处理链接、修改视频字幕小说正文
《不在现场》11章到最后一章
源码
spider-kanxshuo.py
其中 ./utils
文件夹 与 spider-kanxshuo.py
同级
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
| import scrapy from scrapy.selector import Selector import json
from utils import mylog logger = mylog.mylogger(__name__)
outDict = []            # accumulates every scraped page so the final markdown file can be written in order
bookCode = '27846'
website = 'st.kanxshuo.com/'


class kanxshuoSpider(scrapy.Spider):
    """Crawl book `bookCode` on kanxshuo page by page and, on the last page,
    dump everything collected into one markdown file under ./txt-markdown/."""

    name = 'kanxshuo-' + str(bookCode)
    # BUG FIX: `website` ends with '/', so the original f-string produced
    # 'http://st.kanxshuo.com//book-...' — rstrip('/') removes the double slash.
    start_urls = [f"http://{website.rstrip('/')}/book-{bookCode}-1.html"]

    # Characters that are illegal in Windows file names; stripped from the page
    # title before it is used to build the output file name.  One translate()
    # pass replaces the original chain of nine .replace() calls.
    # NOTE: the original also did .replace('<br>', '') *after* '<' and '>' were
    # already removed — that call could never match and was dropped as dead code.
    _ILLEGAL_FILENAME_CHARS = str.maketrans('', '', '\\/:*?"<>|')

    def parse(self, response):
        """Extract the title and body of the current page, yield them as an
        item, follow the '下─页' (next page) link, and write the markdown file
        once no next-page link exists."""
        global outDict

        urltitle = response.css('div.bm_h::text').get()
        urltitle = urltitle.translate(self._ILLEGAL_FILENAME_CHARS)

        # Strip the container markup and the site's inline ad <script> blocks,
        # keeping <br> → newline so paragraph breaks survive.
        body = response.css('div.bookContent').get()
        body = body.replace('</div>', '').replace('<br>', '\n')\
            .replace('<div class="bookContent" id="fontzoom">', '')\
            .replace('<div id="a_d_4"><script type="text/javascript" src="/skin/a_728.js"></script>', '')\
            .replace('<span id="a_d_1"> <script type="text/javascript" src="/skin/a_336.js"></script> </span>', '')\
            .replace('<span id="a_d_2"> <script type="text/javascript" src="/skin/a_728.js"></script> </span>', '')

        logger.info('当前页: {}'.format(urltitle))
        logger.debug('正文: {}'.format(body))

        outDict.append({'当前页': urltitle, '正文': body})
        yield {'当前页': urltitle, '正文': body}

        # Find the pager link whose text is exactly '下─页' (the site uses
        # U+2500 '─', not the usual '一' — do not "fix" this string).
        next_page = None
        for nextpage in response.css('div.bpages').css('a'):
            pagename = nextpage.css('a.pn::text').get()
            if pagename == '下─页':
                next_page = nextpage.css('a::attr("href")').get()
                logger.debug(next_page)
                break

        if next_page is not None:
            yield response.follow(next_page, self.parse)
        else:
            # Last page reached: flush everything collected to markdown.
            # (The original also built an `outjsonName` that was never used —
            # removed as dead code.)
            logger.debug(outDict)
            outMDName = "./txt-markdown/kanxshuo-%s-%s.md" % (urltitle, bookCode)
            with open(outMDName, 'w+', newline='', encoding='utf-8') as ff:
                for item in outDict:
                    md_item = f"##### {item['当前页']}\n{item['正文']}\n\n---\n\n"
                    logger.info(md_item)
                    # BUG FIX: the original used ff.writelines(md_item), which
                    # iterates the string character-by-character; write() is
                    # the intended single-string call.
                    ff.write(md_item)
|
./utils/mylog.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
| import logging, coloredlogs
def mylogger(name):
    """Return a named logger with colored console output installed.

    Colors for debug/info/warning/error are overridden from the
    coloredlogs defaults; everything else keeps the library defaults.
    """
    log = logging.getLogger(name)

    # Start from the library's default palette and override only the
    # levels we want to restyle.
    styles = dict(coloredlogs.DEFAULT_LEVEL_STYLES)
    styles.update({
        'debug': {'color': 'magenta'},
        'info': {'color': 'yellow'},
        'warning': {'color': 'blue'},
        'error': {'color': 'red'},
    })

    coloredlogs.install(
        level="DEBUG",
        fmt="%(asctime)s - %(hostname)s - %(name)s[%(process)d] - %(filename)s::%(funcName)s::%(lineno)d - %(levelname)s - %(message)s",
        logger=log,
        level_styles=styles,
    )
    return log
""" logger.debug('Print log level:debug') logger.info('Print log level:info') logger.warning('Print log level:warning') logger.error('Print log level:error') logger.critical('Print log level:critical') """
|
python 执行脚本
scrapy runspider spider-kanxshuo.py -o spider-kanxshuo.json -s FEED_EXPORT_ENCODING=UTF-8 -s LOG_FILE=spider-kanxshuo.log
爬取 B站 《庆余年》 有声小说视频 (视频+小说=伴读书郎)
- 下载B站 有声小说视频 使用剪映 识别视频字幕 保存为小说正文
- 使用 vim 正则表达式 处理 爬取小说正文
- 《庆余年》小说正文
爬虫本地数据库 NoSQL数据库
TinyDB
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
from BetterJSONStorage import BetterJSONStorage
from tinydb import TinyDB, Query
from pathlib import Path

# Open (or create) the movie database; BetterJSONStorage requires an explicit
# access_mode and is a drop-in TinyDB storage backend.
path = Path('./output/moviedb4.json')
moviedb = TinyDB(path, access_mode="r+", storage=BetterJSONStorage)
moviedb.default_table_name = 'movietable'
movietable = moviedb.table('movietable')
query = Query()
# NOTE(review): this is a fragment — `movieJson`, `name`, `url`, `jsonResult`,
# `cover_url` and `logger` are defined elsewhere (not visible in this excerpt).
# Insert-or-update the record keyed by its 'name' field.
ret = movietable.upsert(movieJson, query.name == name)
# NOTE(review): this branch is a no-op — if the record is missing, the
# following .doc_id access raises AttributeError on None; presumably the pass
# was meant to be a `return`/`continue`. Confirm against the full script.
if movietable.get(query.name == name) is None:
    pass
doc_id = movietable.get(query.name == name).doc_id
# Patch individual fields of the matched document by its doc_id.
movietable.update({'url':url}, doc_ids = [doc_id])
movietable.update({'screenshots':jsonResult['screenshots']}, doc_ids = [doc_id])
movietable.update({'cover':cover_url}, doc_ids = [doc_id])
logger.debug(movietable.get(doc_id=doc_id))
def print_db():
    """Dump every record of the 'actorTable' TinyDB table to ./db/actorDB.md.

    Each record is rendered through `print_item` (defined elsewhere in this
    file) and appended to the markdown file; the running count is logged.
    """
    outMDName = "./db/actorDB.md"
    logger.info('outMDName total: {}'.format(outMDName))

    with open(outMDName, 'w+', newline='', encoding='utf-8') as ff:
        logger.info('outMDName: {}'.format(outMDName))
        with TinyDB(path, access_mode="r+", storage=BetterJSONStorage) as actorDB:
            logger.info('DB total: {}'.format(actorDB))
            actorTable = actorDB.table('actorTable')
            logger.info('DB table: {}'.format(actorTable))

            i = 0
            for item in actorTable.all():
                logger.info(item)
                md_item = print_item(item)  # render one record as markdown
                i += 1
                ff.writelines(md_item)
                ff.flush()  # flush per record so a crash mid-run keeps partial output
                logger.info(md_item)
                logger.info(i)
            # NOTE: the original called actorDB.close() and ff.close() here and
            # ended with a bare `return` — all redundant, since both `with`
            # blocks close their resources on exit; removed.
def flush_db():
    """Upsert the collected actor dicts into the 'actorTable' TinyDB table.

    Each dict in `outDict` is keyed by its 'name' field; existing records
    are updated in place, new ones inserted.
    """
    outDict = []
    ...  # (elided in the original source) build each user_dict
    outDict.append(user_dict)
    ...
    path = Path('./db/actorDB.json')  # was an f-string with no placeholders — plain literal
    with TinyDB(path, access_mode="r+", storage=BetterJSONStorage) as actorDB:
        actorDB.default_table_name = 'actorTable'
        actorTable = actorDB.table('actorTable')
        query = Query()
        for actor in outDict:
            logger.info('actor:::::::::::{}'.format(actor))
            # BUG FIX: original read `actor.name']` — a syntax error; the
            # intended expression is the dict lookup actor['name'].
            actorTable.upsert(actor, query.name == actor['name'])
        logger.info('DB total: {}'.format(actorDB))
        # NOTE: the original called actorDB.close() here — redundant, the
        # `with` block already closes the database on exit; removed.
|
Here is a footnote reference,[^1] and another.[^longnote]
Endnotes
[^1]: Here is the footnote.
[^longnote]: Here’s one with multiple blocks.