Summary
This article shows how to crawl a novel from Miku Wenxue (米库文学) with Java and save it to local disk.
Before reading, please be aware of the following:
1. Crawling novels without authorization is illegal; please respect intellectual property rights.
2. This article only demonstrates a technical approach; illegal crawling is neither recommended nor permitted.
3. Proceed with caution.
The code below targets the novel "从问题儿童开始的旅行": it walks through every chapter linked from the novel's index page and saves the full text to local disk.
First, add the following dependencies to pom.xml:
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.3</version>
</dependency>
<dependency>
    <groupId>cn.wanghaomiao</groupId>
    <artifactId>JsoupXpath</artifactId>
    <version>2.4.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.8.0</version>
</dependency>
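Before writing the crawler itself, you may want to confirm that the dependencies resolve and the site is reachable. The following is a minimal sketch (the class name DependencyCheck and the 10-second timeout are my own choices, not part of the original article) that fetches the novel's index page and prints its title:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class DependencyCheck {
    public static void main(String[] args) throws Exception {
        // Fetch the novel's index page and print its <title> to verify
        // that jsoup is on the classpath and the site is reachable.
        Document doc = Jsoup.connect("http://www.mikuwx.com/29/29022/")
                .timeout(10_000) // 10-second connect/read timeout
                .get();
        System.out.println(doc.title());
    }
}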
The main implementation class:
package com.fenglun.shunfengche.pachong.miku;

import java.io.File;
import java.nio.charset.Charset;

import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class MainClass {

    // http://www.mikuwx.com/29/29022/
    public static void main(String[] args) {
        MainClass mainClass = new MainClass();
        String yuming = "http://www.mikuwx.com/29/29022/"; // base URL of the novel
        // String url = yuming + "10280813.shtml"; // novel home page
        // String evZhangJie = mainClass.getEvZhangJie(yuming, "10280813.shtml");
        // System.out.println(evZhangJie);
        try {
            Document doc = Jsoup.connect(yuming).get();

            // Novel title
            Elements elementsByClass = doc.getElementsByClass("bookTitle text-overflow");
            String xiaoshuomingcheng = elementsByClass.text(); // novel name
            String xiaoshuoneirong = "《" + xiaoshuomingcheng + "》";

            // Links to each chapter
            Elements zhangjies = doc.getElementsByClass("panel panel-default hidden-xs");
            Elements aHrefTitles = zhangjies.get(0).getElementsByTag("a");
            int size = aHrefTitles.size();
            for (int i = 0; i < size; i++) {
                Element element = aHrefTitles.get(i);
                String evUrl = element.attr("href");
                xiaoshuoneirong += "\n" + element.text(); // chapter title
                // System.out.println(evUrl);
                int index = evUrl.lastIndexOf("/");
                evUrl = evUrl.substring(index + 1);
                xiaoshuoneirong += "\n" + mainClass.getEvZhangJie(yuming, evUrl);
            }
            // System.out.println(xiaoshuoneirong);

            File file = new File("D:\\" + xiaoshuomingcheng + ".txt");
            FileUtils.writeStringToFile(file, xiaoshuoneirong,
                    Charset.defaultCharset(), false);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches the content of a single chapter page by its relative link.
     *
     * @param yuming base URL of the novel
     * @param url    relative URL of the chapter page
     */
    private String getEvZhangJie(String yuming, String url) {
        String content = "";
        try {
            Document doc = Jsoup.connect(yuming + url).get();
            Element zhangjie = doc.getElementById("htmlContent");

            // Content of the current page
            content = zhangjie.html();
            content = content.replaceAll("<br>", "\n");
            content = content.replaceAll("&nbsp;", " ");
            content = content.replaceAll("\n\n\n\n", "\n");
            content = content
                    .replaceAll("【米库文学 www.mikuwx.com】一秒记住,更新快,无弹窗,免费读!", "");

            // Check whether the chapter continues on a next page
            Elements nextPages = doc.getElementsByClass("text-center");
            int size = nextPages.size();
            String nextPageUrl = null;
            for (int i = 0; i < size; i++) {
                Elements elementsByTag = nextPages.get(i).getElementsByTag("a");
                if (elementsByTag.size() == 3) {
                    Element element = elementsByTag.get(2);
                    String str = element.text();
                    if (str.contains("下一页")) {
                        // A next page exists and must be parsed as well
                        nextPageUrl = element.attr("href");
                        break;
                    }
                }
            }
            if (nextPageUrl != null) {
                System.out.println("Next page found, parsing: " + yuming + nextPageUrl);
                content += getEvZhangJie(yuming, nextPageUrl);
            } else {
                return content;
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return content;
    }
}
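MainClass above calls Jsoup.connect(...).get() with default settings. If the site rejects requests that lack a browser-like user agent, or if you want to space out the chapter requests, a small helper like the sketch below could replace those direct calls. The class name PoliteFetcher, the user-agent string, and the 1-second delay are all assumptions for illustration, not requirements of the site:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class PoliteFetcher {

    // Hypothetical helper: wraps Jsoup.connect with a browser-like user agent,
    // a timeout, and a short pause so consecutive chapter requests are spaced out.
    static Document fetch(String url) throws Exception {
        Document doc = Jsoup.connect(url)
                .userAgent("Mozilla/5.0") // assumed UA string; any browser-like value works
                .timeout(10_000)          // fail instead of hanging indefinitely
                .get();
        Thread.sleep(1_000);              // 1-second pause between requests (arbitrary choice)
        return doc;
    }

    public static void main(String[] args) throws Exception {
        // Usage example: fetch the index page the same way MainClass does.
        Document doc = fetch("http://www.mikuwx.com/29/29022/");
        System.out.println(doc.getElementsByClass("bookTitle text-overflow").text());
    }
}

The two Jsoup.connect(...).get() calls in MainClass (one for the index page, one inside getEvZhangJie) could then call fetch(...) instead; the rest of the parsing logic stays unchanged.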