package com.yoyo.clourmovice;

import com.yoyo.clourmovice.pipeline.BugPagePipeLine;
import com.yoyo.clourmovice.pipeline.DownloadUriPipeLine;
import com.yoyo.clourmovice.processor.BugPageProcessor;
import com.yoyo.clourmovice.processor.DownloadUriProcessor;
import org.apache.commons.io.FileUtils;
import us.codecraft.webmagic.Spider;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.List;

/**
 * Entry point of the crawler. Runs in two phases:
 * <ol>
 *   <li>crawl every page of forum fid=41 and collect thread URLs into list.txt;</li>
 *   <li>visit each collected thread and extract download links into uri.txt.</li>
 * </ol>
 */
public class ClourMoviceApplication {

    /** Base URL prepended to every relative thread URI read from list.txt. */
    static String domain = "http://www.mmbabe.com/";

    /** Last forum page index to crawl — site's pagination bound at time of writing. */
    private static final int PAGE_SIZE = 1138;

    /**
     * Crawls every listing page of forum fid=41; the discovered thread URLs
     * are appended to list.txt by {@link BugPagePipeLine}.
     */
    private static void loadingPageUrl() {
        for (int index = 1; index <= PAGE_SIZE; index++) {
            Spider.create(new BugPageProcessor())
                    .addUrl("http://www.mmbabe.com/forumdisplay.php?fid=41&page=" + index)
                    .addPipeline(new BugPagePipeLine())
                    .thread(5)
                    .run();
        }
    }

    /**
     * Reads the relative thread URIs collected in list.txt and crawls each one;
     * extracted download links are appended to uri.txt by {@link DownloadUriPipeLine}.
     *
     * @throws IOException if list.txt cannot be read
     */
    public static void loadingDownloadURI() throws IOException {
        File file = FileUtils.getFile("list.txt");
        // StandardCharsets.UTF_8 instead of Charset.forName("utf-8"): no lookup, no typo risk.
        List<String> result = FileUtils.readLines(file, StandardCharsets.UTF_8);
        result.forEach(uri -> {
            String newUri = domain + uri;
            System.out.println(newUri);
            Spider.create(new DownloadUriProcessor())
                    .addUrl(newUri)
                    .addPipeline(new DownloadUriPipeLine())
                    .thread(1)
                    .run();
        });
    }

    public static void main(String[] args) throws IOException {
        loadingPageUrl();
        System.out.println("-----------------download page success");
        loadingDownloadURI();
        System.out.println("-----------------download uri success");
    }
}
package com.yoyo.clourmovice.pipeline;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.io.FileUtils;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;

/**
 * Pipeline that appends every extracted thread URL to list.txt,
 * one URL per CRLF-terminated line.
 */
public class BugPagePipeLine implements Pipeline {

    /** Shared append-target; every processed page adds its URLs to this file. */
    public static File urlFile = FileUtils.getFile("list.txt");

    @Override
    public void process(ResultItems resultitems, Task task) {
        Map<String, Object> mapResults = resultitems.getAll();
        @SuppressWarnings("unchecked") // processor stores a List<String> under "url"
        List<String> uris = (List<String>) mapResults.get("url");
        // Guard against pages where no "url" field was put (the original would
        // NPE here); mirrors the check already done in DownloadUriPipeLine.
        if (CollectionUtils.isNotEmpty(uris)) {
            uris.forEach(uri -> {
                try {
                    FileUtils.write(urlFile, uri + "\r\n", StandardCharsets.UTF_8, true);
                } catch (IOException e) {
                    e.printStackTrace();
                }
            });
        }
    }
}
package com.yoyo.clourmovice.pipeline;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.io.FileUtils;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;

/**
 * Pipeline that appends every extracted download link to uri.txt,
 * each link followed by a blank line.
 */
public class DownloadUriPipeLine implements Pipeline {

    /** Shared append-target; every processed page adds its links to this file. */
    public static File urlFile = FileUtils.getFile("uri.txt");

    @Override
    public void process(ResultItems resultItems, Task task) {
        Map<String, Object> fields = resultItems.getAll();
        List<String> links = (List<String>) fields.get("url");
        if (CollectionUtils.isNotEmpty(links)) {
            for (String link : links) {
                try {
                    FileUtils.write(urlFile, link + "\r\n\n", "utf-8", true);
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
package com.yoyo.clourmovice.processor;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import java.util.List;

/**
 * Extracts thread links from forum listing pages and stores them
 * under the "url" result field for {@code BugPagePipeLine}.
 */
public class BugPageProcessor implements PageProcessor {

    // Headers are configured once here, not on every getSite() call as before
    // (the framework calls getSite() repeatedly).
    private final Site site = buildSite();

    @Override
    public void process(Page page) {
        // Thread anchor in the third cell of each listing row; layout-dependent XPath.
        Selectable selectable = page.getHtml()
                .xpath("/html/body/center/form/div/div/table/tbody/tr/td[3]/a[1]/@href");
        List<String> result = selectable.all();
        page.putField("url", result);
    }

    /** Builds the site config with browser-like headers to pass as a logged-in client. */
    private static Site buildSite() {
        Site site = Site.me().setRetryTimes(3).setSleepTime(100)
                .setDomain("www.mmbabe.com");
        site.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
        site.addHeader("Accept-Encoding", "gzip, deflate");
        site.addHeader("Accept-Language", "zh-CN,zh;q=0.9,zh-TW;q=0.8");
        // Fix: "keep-alive" is a Connection header value; the original set it as a
        // Cookie and then overwrote it with the real cookie below, so the
        // keep-alive header was never actually sent.
        site.addHeader("Connection", "keep-alive");
        site.addHeader("Host", "www.mmbabe.com");
        site.addHeader("Upgrade-Insecure-Requests", "1");
        site.addHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36");
        site.addHeader("Cookie", "is_use_cookied=yes; is_use_cookiex=yes; cdb_cookietime=2592000; cdb_auth=seDLoqsSbCm%2Fmn%2FrjUbqSXTmIdwNBIcSgmvTCH8sUwMN1QywiXMDlZvFyNbJfTqwUA; cdb_sid=pV3tmY");
        return site;
    }

    @Override
    public Site getSite() {
        return site;
    }
}
package com.yoyo.clourmovice.processor;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import java.util.ArrayList;
import java.util.List;

/**
 * Visits a forum thread page and extracts "thunder" download links from the
 * post body text; links are stored under the "url" result field for
 * {@code DownloadUriPipeLine}.
 */
public class DownloadUriProcessor implements PageProcessor {

    // Headers are configured once here, not on every getSite() call as before
    // (the framework calls getSite() repeatedly).
    private final Site site = buildSite();

    @Override
    public void process(Page page) {
        // Post body text nodes; layout-dependent XPath.
        Selectable selectable = page.getHtml().xpath(
                "/html/body/center/div[4]/form/div/table/tbody/tr/td[2]/table/tbody/tr[2]/td/div[2]/text()");
        List<String> newResult = extractThunderLinks(selectable.all());
        if (CollectionUtils.isNotEmpty(newResult)) {
            System.out.println("\t" + newResult);
            page.putField("url", newResult);
        }
    }

    /**
     * Scans each text fragment for space-separated tokens containing "thunder"
     * and returns the token tail starting at that scheme.
     *
     * @param fragments raw text fragments from the post body (may contain blanks)
     * @return every thunder link found, in document order (possibly empty)
     */
    private static List<String> extractThunderLinks(List<String> fragments) {
        List<String> links = new ArrayList<>();
        for (String fragment : fragments) {
            if (StringUtils.isBlank(fragment)) {
                continue;
            }
            for (String token : fragment.split(" ")) {
                int at = token.indexOf("thunder");
                // length > 7 mirrors the original guard: token must be longer
                // than the bare scheme word to carry a payload.
                if (token.length() > 7 && at >= 0) {
                    links.add(token.substring(at));
                }
            }
        }
        return links;
    }

    /** Builds the site config with browser-like headers to pass as a logged-in client. */
    private static Site buildSite() {
        Site site = Site.me().setRetryTimes(3).setSleepTime(100)
                .setDomain("www.mmbabe.com");
        site.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
        site.addHeader("Accept-Encoding", "gzip, deflate");
        site.addHeader("Accept-Language", "zh-CN,zh;q=0.9,zh-TW;q=0.8");
        // Fix: "keep-alive" is a Connection header value; the original set it as a
        // Cookie and then overwrote it with the real cookie below, so the
        // keep-alive header was never actually sent.
        site.addHeader("Connection", "keep-alive");
        site.addHeader("Host", "www.mmbabe.com");
        site.addHeader("Upgrade-Insecure-Requests", "1");
        site.addHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36");
        // Cookie literal was split across lines in the source; reassembled to
        // match the value used by BugPageProcessor.
        site.addHeader("Cookie", "is_use_cookied=yes; is_use_cookiex=yes; cdb_cookietime=2592000; cdb_auth=seDLoqsSbCm%2Fmn%2FrjUbqSXTmIdwNBIcSgmvTCH8sUwMN1QywiXMDlZvFyNbJfTqwUA; cdb_sid=pV3tmY");
        return site;
    }

    @Override
    public Site getSite() {
        return site;
    }
}
相关推荐
webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。 最新版:WebMagic-0.7.3 Maven依赖: <groupId>us.codecraft</groupId> <artifactId>webmagic-core</artifactId> <version>0.7.3</version> ...
最简单的爬虫设置,最好二次开发的爬虫 WebMagic 框架,它提供简单灵活的API,只需少量代码即可实现一个爬虫。webmagic采用完全模块化的设计,功能覆盖整个爬虫的生命周期(链接提取、页面下载、内容抽取、持久化),...
springboot 集成webmagic实现网页数据爬取功能 内含项目demo工程 直接导入可使用
爬虫webmagic中文资料
学习使用webmagic 进行静态页面抓取,springboot + webmagic demo项目,进行学习使用 。java 使用webmagic爬取网页数据
WebMagic的结构分为Downloader、PageProcessor、Scheduler、Pipeline四大组件,并由Spider将它们彼此组织起来。这四大组件对应爬虫生命周期中的下载、处理、管理和持久化等功能。该资源提供了webmagic开发时需要用到...
webmagic所需jar包,WebMagic主要包含两个jar包:webmagic-core-{version}.jar和webmagic-extension-{version}.jar。在项目中添加这两个包的依赖,即可使用WebMagic
最新Java WebMagic爬虫教程(包括:HttpClient/Jsoup的使用教程)、爬虫案例项目
webmagic是一个开源的Java垂直爬虫框架,目标是简化爬虫的开发流程,让开发者专注于逻辑功能的开发。webmagic的核心非常简单,但是覆盖爬虫的整个流程,也是很好的学习爬虫开发的材料。作者曾经在前公司进行过一年的...
基于WebMagic爬虫框架的全部依赖jar包,下载即用,如何使用可以查看我的博客有详细的讲解基于WebMagic爬虫框架的爬虫开发。
webmagic相关jar包,需要也可以前往,https://blog.csdn.net/qq_40374604,
Webmagic爬取数据导入到数据库与Elasticsearch5,详细介绍请参考:http://blog.csdn.net/u011781521/article/details/77866642
基于webmagic的网络爬虫入门demo 希望对大家有所帮助
webmagic开发所需要的全部jar。如果不使用maven可以下载这个jar来使用。webmagic开发所需要的全部jar 0.5.2
webmagic框架所需要的jar包,相关文档资料请参考官方网站。http://webmagic.io/docs/zh/posts/ch2-install/without-maven.html
web爬虫WebMagic-0.7.3源代码及示例,在项目中一直在使用该版本,暂没发现bug。
webmagic的所需要的所有jar包,最新的,0.6.0版本的。
WebMagic(Java)爬虫实现,实现数据爬取,并导出到excel文件
webmagic所有依赖的jar包,亲测可用,全部都有,最新版本哟。