1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
import json
import os

import scrapy
from scrapy.selector import Selector

from utils import mylog

logger = mylog.mylogger(__name__)
# Accumulates one {'当前页': ..., '正文': ...} dict per crawled page, in crawl
# order, so the whole book can be written out once the last page is reached.
outDict = []
bookCode = '27846'
website = 'st.kanxshuo.com/'

# Characters removed from the page title before it is used in a file name.
_TITLE_STRIP_TABLE = str.maketrans('', '', '\\/:*?"<>|')

# Literal fragments rewritten in the chapter HTML, applied in this order.
_BODY_REPLACEMENTS = (
    ('</div>', ''),
    ('<br>', '\n'),
    ('<div class="bookContent" id="fontzoom">', ''),
    ('<div id="a_d_4"><script type="text/javascript" src="/skin/a_728.js"></script>', ''),
    ('<span id="a_d_1"> <script type="text/javascript" src="/skin/a_336.js"></script> </span>', ''),
    ('<span id="a_d_2"> <script type="text/javascript" src="/skin/a_728.js"></script> </span>', ''),
)


def _clean_title(raw_title):
    """Return *raw_title* with file-name-hostile characters removed.

    ``None`` (selector matched nothing) yields an empty string instead of
    raising, so one malformed page does not abort the crawl.
    """
    if raw_title is None:
        return ''
    # Drop '<br>' first: once '<' and '>' are stripped it could no longer match.
    return raw_title.replace('<br>', '').translate(_TITLE_STRIP_TABLE)


def _clean_body(raw_html):
    """Return the chapter HTML with wrapper/ad markup removed and <br> as newlines.

    ``None`` (selector matched nothing) yields an empty string.
    """
    if raw_html is None:
        return ''
    text = raw_html
    for old, new in _BODY_REPLACEMENTS:
        text = text.replace(old, new)
    return text


def _write_markdown(last_title):
    """Write every page collected in ``outDict`` to a single Markdown file.

    The file name is built from the title of the last crawled page and
    ``bookCode``. The output directory is created when it does not exist.
    """
    out_md_name = "./txt-markdown/kanxshuo-%s-%s.md" % (last_title, bookCode)
    os.makedirs(os.path.dirname(out_md_name), exist_ok=True)
    with open(out_md_name, 'w', newline='', encoding='utf-8') as ff:
        for item in outDict:
            md_item = f"##### {item['当前页']}\n{item['正文']}\n\n---\n\n"
            logger.info(md_item)
            ff.write(md_item)


class kanxshuoSpider(scrapy.Spider):
    """Crawl one book page by page and dump it to Markdown at the end.

    Starts at page 1 of ``bookCode`` on ``website``, yields one item per
    page, follows the "next page" link, and writes the Markdown file when no
    such link is found.
    """

    name = 'kanxshuo-' + str(bookCode)
    # rstrip('/') avoids the 'host//book-...' double slash, since ``website``
    # already ends with '/'.
    start_urls = [f'http://{website.rstrip("/")}/book-{bookCode}-1.html']

    def parse(self, response):
        """Extract title and text from one page, then follow the next-page link.

        Yields the page item, and a follow-up request when a next page exists.
        """
        raw_title = response.css('div.bm_h::text').get()
        raw_body = response.css('div.bookContent').get()
        if raw_title is None or raw_body is None:
            logger.info('missing title or body on: {}'.format(response.url))

        urltitle = _clean_title(raw_title)
        body = _clean_body(raw_body)
        logger.info('当前页: {}'.format(urltitle))
        logger.debug('正文: {}'.format(body))

        item = {
            '当前页': urltitle,
            '正文': body,
        }
        # Store a copy so later mutation of the yielded item (e.g. by an item
        # pipeline) cannot alter what is written to the Markdown file.
        outDict.append(dict(item))
        yield item

        next_page = None
        for link in response.css('div.bpages').css('a'):
            # The label is compared verbatim; it must match the site's text.
            if link.css('a.pn::text').get() == '下─页':
                next_page = link.css('a::attr("href")').get()
                logger.debug(next_page)
                break

        if next_page is not None:
            yield response.follow(next_page, self.parse)
        else:
            # NOTE(review): the original also built a "./txt-jsons/..." path
            # but never wrote to it; it is dropped here. The yielded items
            # can be exported as JSON through Scrapy's feed exports.
            logger.debug(outDict)
            _write_markdown(urltitle)
|