from bs4 import BeautifulSoup
from langconv import *
import re
from urllib import request
from selenium import webdriver
import multiprocessing
import os
import time
from concurrent.futures import ThreadPoolExecutor
# Request headers passed to urllib when downloadfiction fetches a novel page.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36"}
# Root output directory; category sub-directories are created below it.
rootPath = 'E://h_novel/'
# Thread pool that receives the per-novel download tasks submitted by kindsdown.
tp = ThreadPoolExecutor(max_workers=8)
# exist_ok=True avoids the check-then-create race that occurs when several
# processes import this module at the same time.
os.makedirs(rootPath, exist_ok=True)
def TraditionalToSimplified(line):  # Traditional Chinese -> Simplified Chinese
    """Return ``line`` converted with the ``zh-hans`` converter.

    ``Converter`` is not defined in this file; presumably it comes from the
    ``langconv`` star import at the top of the module.

    Args:
        line: Text to convert.

    Returns:
        Whatever ``Converter("zh-hans").convert(line)`` returns.
    """
    return Converter("zh-hans").convert(line)
def downloadfiction(url, fictionName, savePath):
    """Download one novel page and store its text as ``<savePath><fictionName>.txt``.

    The page is fetched with the module-level ``headers``, the element with
    id ``novel_content_txtsize`` is extracted and split into lines on
    ``<br/>``, ``<br>`` and newline characters, and each line is passed
    through ``TraditionalToSimplified`` before it is written.

    Args:
        url: Address of the novel page.
        fictionName: Title of the novel; used as the file name without extension.
        savePath: Target directory. It is concatenated as-is with the file
            name, so it must already end with a path separator.

    Returns:
        None. Nothing is fetched when the target file already exists, and no
        file is created when the page has no content element.
    """
    target = savePath + '{}.txt'.format(fictionName)
    if os.path.exists(target):
        print('{}已存在\n'.format(fictionName))
        return

    pg = request.Request(url, headers=headers)
    # The context manager releases the connection even if read/decode fails.
    with request.urlopen(pg) as response:
        hml = response.read().decode('utf-8', 'ignore')

    sup = BeautifulSoup(hml, 'html.parser')
    dv = sup.find(id="novel_content_txtsize")
    if dv is None:
        # Without this guard str(None) would be written out as an empty file,
        # and the "already exists" check above would block any later retry.
        print('{}未找到正文\n'.format(fictionName))
        return

    pieces = re.split(r'<br/>|<br>|\n', str(dv))
    with open(target, 'w', encoding='utf-8') as f:
        # The first and last pieces are skipped; presumably they carry the
        # element's own opening and closing tags rather than novel text.
        for piece in pieces[1:-1]:
            f.write(TraditionalToSimplified(piece) + '\n')
    print('{}下载完成\n'.format(fictionName))
def kindsdown(kindfurl, kindname):
    """Open one listing page in Chrome and queue a download for every novel link.

    Links are collected from the ``novel_left`` and ``novel_right`` elements
    in that order. For each ``<a>`` the ``href`` and the link text (converted
    with ``TraditionalToSimplified``) are submitted to the shared thread pool
    ``tp`` as a ``downloadfiction`` task.

    Titles from both columns are converted, so file names are consistent. The
    previous code converted only the right column.

    Args:
        kindfurl: URL of the listing page to open.
        kindname: Category name; files go to ``rootPath + kindname + '/'``.

    Returns:
        None.
    """
    # Function-scope imports keep the file's top-level import block untouched.
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    savepath = rootPath + kindname + '/'
    # Several worker processes handle pages of the same category, so a
    # check-then-mkdir sequence could raise FileExistsError.
    os.makedirs(savepath, exist_ok=True)

    driver = webdriver.Chrome()
    try:
        driver.get(kindfurl)
        driver.minimize_window()
        time.sleep(5.0)  # give the page time to load before querying it
        for column_class in ('novel_left', 'novel_right'):
            try:
                anchors = driver.find_element(
                    By.CLASS_NAME, column_class
                ).find_elements(By.TAG_NAME, 'a')
            except WebDriverException:
                # A missing column is not fatal. The driver stays open so the
                # other column can still be read.
                continue
            for anchor in anchors:
                novelurl = anchor.get_attribute('href')
                novelName = TraditionalToSimplified(anchor.text)
                tp.submit(downloadfiction, novelurl, novelName, savepath)
    finally:
        # quit() ends the whole driver session, and runs on every exit path.
        driver.quit()
    time.sleep(5)
# One row per category:
#   [index page URL, listing URL prefix, listing URL suffix, page bound, directory name]
# The index page URL (first column) is not used by main().
CATEGORIES = [
    ['http://18av.mm-cg.com/students_on_campus.html', 'http://18av.mm-cg.com/serch_novel/%E5%B0%8F%E8%AA%AA_%E5%AD%B8%E7%94%9F%E6%A0%A1%E5%9C%92_', '.htm', 59, '学生校园'],
    ['http://18av.mm-cg.com/workplace_passion.html', 'http://18av.mm-cg.com/serch_novel/%E5%B0%8F%E8%AA%AA_%E8%81%B7%E5%A0%B4%E6%BF%80%E6%83%85_', '.htm', 145, '职场激情'],
    ['http://18av.mm-cg.com/experience_story.html', 'http://18av.mm-cg.com/serch_novel/%E5%B0%8F%E8%AA%AA_%E7%B6%93%E9%A9%97%E6%95%85%E4%BA%8B_', '.htm', 80, '经验故事'],
    ['http://18av.mm-cg.com/violent_abuse.html', 'http://18av.mm-cg.com/serch_novel/%E5%B0%8F%E8%AA%AA_%E6%9A%B4%E5%8A%9B%E8%99%90%E5%BE%85_', '.htm', 34, '暴力虐待'],
    ['http://18av.mm-cg.com/not_london_love.html', 'http://18av.mm-cg.com/serch_novel/%E5%B0%8F%E8%AA%AA_%E4%B8%8D%E5%80%AB%E6%88%80%E6%83%85_', '.htm', 169, '不伦恋情'],
    ['http://18av.mm-cg.com/groups_companion.html', 'http://18av.mm-cg.com/serch_novel/%E5%B0%8F%E8%AA%AA_%E7%BE%A4%E9%AB%94%E6%8F%9B%E4%BC%B4_', '.htm', 15, '群体换伴'],
    ['http://18av.mm-cg.com/wife_mature.html', 'http://18av.mm-cg.com/serch_novel/%E5%B0%8F%E8%AA%AA_%E4%BA%BA%E5%A6%BB%E7%86%9F%E5%A5%B3_', '.htm', 158, '人妻熟女'],
    ['http://18av.mm-cg.com/sciencefiction.html', 'http://18av.mm-cg.com/serch_novel/%E5%B0%8F%E8%AA%AA_%E7%A7%91%E5%AD%B8%E5%B9%BB%E6%83%B3_', '.htm', 21, '科学幻想'],
    ['http://18av.mm-cg.com/other_stories.html', 'http://18av.mm-cg.com/serch_novel/%E5%B0%8F%E8%AA%AA_%E5%85%B6%E4%BB%96%E6%95%85%E4%BA%8B_', '.htm', 15, '其他故事'],
    ['http://18av.mm-cg.com/other_fantasy.html', 'http://18av.mm-cg.com/serch_novel/%E5%B0%8F%E8%AA%AA_%E7%8E%84%E5%B9%BB%E4%BB%99%E4%BF%A0_', '.htm', 35, '玄幻仙侠'],
    # The comma after the next row was missing, which made Python evaluate
    # "[...] [...]" as a list subscript and raise TypeError.
    ['http://18av.mm-cg.com/anime_modification.html', 'http://18av.mm-cg.com/serch_novel/%E5%B0%8F%E8%AA%AA_%E5%8B%95%E6%BC%AB%E4%BF%AE%E6%94%B9_', '.htm', 5, '动漫修改'],
    ['http://18av.mm-cg.com/long_serial.html', 'http://18av.mm-cg.com/serch_novel/%E5%B0%8F%E8%AA%AA_%E9%95%B7%E7%AF%87%E9%80%A3%E8%BC%89_', '.htm', 9, '长篇连载'],
]


def main():
    """Schedule ``kindsdown`` for every listing page of every category.

    A pool of three worker processes receives one task for the bare listing
    prefix and one task for each numbered page
    ``prefix + str(i) + suffix`` with ``i`` in ``range(2, page_bound)``.
    """
    pools = multiprocessing.Pool(processes=3)
    for _index_url, prefix, suffix, page_bound, kindname in CATEGORIES:
        # NOTE(review): the first request uses the bare prefix and the index
        # page URL is never requested; confirm this is the intended first page.
        pools.apply_async(kindsdown, (prefix, kindname))
        # NOTE(review): range() excludes page_bound itself; confirm the last
        # listing page is not being skipped.
        for i in range(2, page_bound):
            pools.apply_async(kindsdown, (prefix + str(i) + suffix, kindname))
    pools.close()
    pools.join()
    time.sleep(3600)  # wait for threads to finish


if __name__ == "__main__":
    main()