scrapy 爬虫

🌳爬取小说正文 http://st.kanxshuo.com

🌳使用 vim 正则表达式处理连接修改视频字幕小说正文

🌳源码

🌳spider-kanxshuo.py

其中 ./utils 文件夹与 spider-kanxshuo.py 同级

import scrapy
from scrapy.selector import Selector
import json

from utils import mylog
logger = mylog.mylogger(__name__)

# Accumulates one {'当前页': title, '正文': text} dict per crawled page.
outDict = []
# Numeric id of the book to crawl; it is interpolated into the start URL.
bookCode = '27846'
# Host only, without a trailing slash: the URL template that uses this value
# already inserts the "/" separator, so a trailing slash here would produce
# "http://host//book-...".
website = 'st.kanxshuo.com'
class kanxshuoSpider(scrapy.Spider):
    """Crawl one book page by page and export the collected text as Markdown.

    Every parsed page is yielded as an item (so Scrapy's own feed export
    works) and is also appended to the module-level ``outDict`` list. When a
    page has no "next page" link, everything collected so far is written to
    ``./txt-markdown/kanxshuo-<last page title>-<bookCode>.md``.
    """

    name = 'kanxshuo-'+str(bookCode)
    # rstrip keeps the URL well-formed even when ``website`` carries a
    # trailing slash (which would otherwise yield "//book-...").
    start_urls = ['http://{}/book-{}-1.html'.format(website.rstrip('/'), bookCode),]

    # Characters that Windows does not allow in file names; the page title is
    # later used as part of the output file name.
    _FORBIDDEN_FILENAME_CHARS = str.maketrans('', '', '\\/:*?"<>|')

    # Wrapper markup and ad/script fragments stripped verbatim from the body.
    _BODY_NOISE = (
        '<div class="bookContent" id="fontzoom">',
        '<div id="a_d_4"><script type="text/javascript" src="/skin/a_728.js"></script>',
        '<span id="a_d_1"> <script type="text/javascript" src="/skin/a_336.js"></script> </span>',
        '<span id="a_d_2"> <script type="text/javascript" src="/skin/a_728.js"></script> </span>',
    )

    @staticmethod
    def _sanitize_title(raw_title):
        """Return *raw_title* with all file-name-hostile characters removed."""
        return raw_title.translate(kanxshuoSpider._FORBIDDEN_FILENAME_CHARS)

    @staticmethod
    def _clean_body(raw_html):
        """Turn the raw ``div.bookContent`` HTML into plain chapter text."""
        text = raw_html.replace('</div>', '').replace('<br>', '\n')
        for fragment in kanxshuoSpider._BODY_NOISE:
            text = text.replace(fragment, '')
        return text

    @staticmethod
    def _write_markdown(last_title):
        """Write every page collected in ``outDict`` to a single Markdown file.

        Args:
            last_title: sanitized title of the final page; it becomes part of
                the output file name.
        """
        # Function-scope import: the file's import block does not provide os.
        import os

        logger.debug(outDict)
        outMDName = "./txt-markdown/kanxshuo-%s-%s.md" % (last_title, bookCode)
        # Create the target folder up front so a finished crawl is not lost
        # to a FileNotFoundError at the very last step.
        os.makedirs(os.path.dirname(outMDName), exist_ok=True)
        with open(outMDName, 'w', newline='', encoding='utf-8') as ff:
            for item in outDict:
                md_item = '##### {}\n{}\n\n---\n\n'.format(item['当前页'], item['正文'])
                logger.info(md_item)
                ff.write(md_item)

    def parse(self, response):
        """Extract title and text of one page, then follow the next-page link.

        Args:
            response: the Scrapy response for a book page.

        Yields:
            A dict with the page title ('当前页') and text ('正文'), followed
            by a request for the next page when one is linked.
        """
        # default='' keeps the string handling below safe when a selector
        # matches nothing (``get()`` would otherwise return None).
        urltitle = self._sanitize_title(response.css('div.bm_h::text').get(default=''))
        body = self._clean_body(response.css('div.bookContent').get(default=''))
        if not body:
            logger.warning('no div.bookContent found on {}'.format(response.url))

        logger.info('当前页: {}'.format(urltitle))
        logger.debug('正文: {}'.format(body))

        page_item = {
            '当前页': urltitle,
            '正文': body,
        }
        # A separate copy is stored so later changes to the yielded item by
        # pipelines cannot alter what ends up in the Markdown export.
        outDict.append(dict(page_item))
        yield page_item

        next_page = None
        for link in response.css('div.bpages').css('a'):
            # NOTE(review): the middle character of this label is not the CJK
            # numeral "一"; presumably it mirrors the site's markup exactly --
            # confirm against a live page before changing it.
            if link.css('a.pn::text').get() == '下─页':
                next_page = link.css('a::attr("href")').get()
                logger.debug(next_page)
                break

        if next_page is not None:
            yield response.follow(next_page, self.parse)
        else:
            # No next-page link: this is the last page, so export everything.
            self._write_markdown(urltitle)

🌳./utils/mylog.py

import logging, coloredlogs

def mylogger(name):
    """Return the logger called *name* with colored output installed on it.

    ``coloredlogs.install`` is applied to that logger at DEBUG level, using a
    copy of the default level styles in which debug, info, warning and error
    are recolored (magenta, yellow, blue and red respectively).

    Args:
        name: logger name passed to ``logging.getLogger``.

    Returns:
        The configured ``logging.Logger``.

    Example::

        logger = mylogger(__name__)
        logger.debug('Print log level: debug')
        logger.info('Print log level: info')
        logger.warning('Print log level: warning')
        logger.error('Print log level: error')
        logger.critical('Print log level: critical')
    """
    logger = logging.getLogger(name)

    # Work on a copy so the library-wide defaults stay untouched.
    level_styles = coloredlogs.DEFAULT_LEVEL_STYLES.copy()
    level_styles['debug'] = {'color': 'magenta'}
    level_styles['info'] = {'color': 'yellow'}
    level_styles['error'] = {'color': 'red'}
    level_styles['warning'] = {'color': 'blue'}
    coloredlogs.install(
        level="DEBUG",  # show records at DEBUG severity and above
        fmt="%(asctime)s - %(hostname)s - %(name)s[%(process)d] - %(filename)s::%(funcName)s::%(lineno)d - %(levelname)s - %(message)s",
        logger=logger,
        level_styles=level_styles,
    )
    return logger

🌳python 执行脚本

scrapy runspider spider-kanxshuo.py -o spider-kanxshuo.json -s FEED_EXPORT_ENCODING=UTF-8 -s LOG_FILE=spider-kanxshuo.log

🌳爬取 B站《庆余年》有声小说视频 (视频+小说=伴读书郎)

下载 B站有声小说视频，使用剪映识别视频字幕并保存为小说正文
使用 vim 正则表达式处理爬取小说正文
《庆余年》小说正文

🌳爬虫本地数据库 NoSQL数据库

🌳TinyDB

🌳更快的json读写插件 BetterJSONStorage

from BetterJSONStorage import BetterJSONStorage
from tinydb import TinyDB, Query
from pathlib import Path

# Usage walkthrough for TinyDB with BetterJSONStorage. The names movieJson,
# name, url, jsonResult, cover_url and logger are not defined in this snippet
# and must be provided by the surrounding code.
path = Path('./output/moviedb4.json')
moviedb = TinyDB(path, access_mode="r+", storage=BetterJSONStorage)

moviedb.default_table_name = 'movietable'
movietable = moviedb.table('movietable')

query = Query()

# Insert the record, or update the existing one that has the same name.
ret = movietable.upsert(movieJson, query.name == name)

# Query the DB once and stop with a clear error when the record is missing,
# instead of dereferencing None on the next line.
record = movietable.get(query.name == name)
if record is None:
    raise LookupError('no record named {!r} in movietable'.format(name))
doc_id = record.doc_id

# Update the DB: all three fields belong to the same document, so they are
# written in a single update call.
movietable.update(
    {
        'url': url,
        'screenshots': jsonResult['screenshots'],
        'cover': cover_url,
    },
    doc_ids=[doc_id],
)
logger.debug(movietable.get(doc_id=doc_id))

# 打印DB
def print_db():
    """Dump every record of the ``actorTable`` table to ``./db/actorDB.md``.

    Each record is rendered with ``print_item`` and written to the Markdown
    file; the record, its rendered form and a running count are logged.
    """
    outMDName = "./db/actorDB.md"
    logger.info('outMDName total: {}'.format(outMDName))

    # Read the data and write it out as markdown. Both context managers
    # release their resource on exit, so no explicit close() is needed.
    with open(outMDName, 'w', newline='', encoding='utf-8') as ff:
        logger.info('outMDName: {}'.format(outMDName))
        # NOTE(review): ``path`` is not defined in this function and is taken
        # from the enclosing scope -- confirm it points at the actor database.
        with TinyDB(path, access_mode="r+", storage=BetterJSONStorage) as actorDB:
            logger.info('DB total: {}'.format(actorDB))
            actorTable = actorDB.table('actorTable')
            logger.info('DB table: {}'.format(actorTable))

            for count, item in enumerate(actorTable.all(), start=1):
                logger.info(item)

                md_item = print_item(item)

                # writelines accepts a single string as well as a sequence of
                # strings, whichever print_item returns.
                ff.writelines(md_item)
                logger.info(md_item)
                logger.info(count)

# 写入DB
def flush_db():
    """Upsert the collected actor records into ``./db/actorDB.json``.

    Records are matched on their ``name`` field: an existing record with the
    same name is updated, otherwise a new one is inserted into the
    ``actorTable`` table.
    """
    outDict = []
    # The code that builds user_dict is elided in this snippet.
    ...
    outDict.append(user_dict)
    ...
    path = Path('./db/actorDB.json')
    # The context manager closes the database on exit.
    with TinyDB(path, access_mode="r+", storage=BetterJSONStorage) as actorDB:
        actorDB.default_table_name = 'actorTable'
        actorTable = actorDB.table('actorTable')
        query = Query()
        for actor in outDict:
            logger.info('actor:::::::::::{}'.format(actor))
            # Each record is a mapping, so the name is read by key.
            actorTable.upsert(actor, query.name == actor['name'])
        logger.info('DB total: {}'.format(actorDB))

Here is a footnote reference,[^1] and another.[^longnote]

🌳Endnotes

[^1]: Here is the footnote.
[^longnote]: Here’s one with multiple blocks.