快速开始
1. 添加Maven依赖
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>1.0.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>1.0.3</version>
</dependency>
2. 编写PageProcessor
/**
 * Example {@link PageProcessor} for GitHub pages.
 *
 * <p>For every page it is given, it queues the repository-style links found on
 * the page for further crawling and records two result fields: {@code "author"}
 * (taken from the page URL) and {@code "name"} (taken from the page HTML).
 */
public class GithubRepoPageProcessor implements PageProcessor {

    /**
     * Pattern for links of the form {@code https://github.com/<owner>/<repo>}.
     * The owner segment allows hyphens and the repository segment allows hyphens
     * and dots; a plain {@code \w+} would cut such names short at the first
     * {@code -} or {@code .} and yield a truncated URL.
     */
    private static final String REPO_LINK_REGEX = "(https://github\\.com/[\\w-]+/[\\w.-]+)";

    /**
     * Pattern whose capture group is the owner segment of a GitHub URL. It
     * requires at least one further path segment after the owner, so a bare
     * profile URL does not match.
     */
    private static final String AUTHOR_REGEX = "https://github\\.com/([\\w-]+)/.*";

    /** XPath used to read the repository name from the page heading. */
    private static final String NAME_XPATH = "//h1[@class='public']/strong/a/text()";

    /**
     * Crawl settings returned by {@link #getSite()}: 3 retries and a sleep time
     * of 100 (presumably milliseconds between requests; confirm against the
     * {@code Site} documentation).
     */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    /**
     * Extracts follow-up links and result fields from one downloaded page.
     *
     * @param page the page to process
     */
    @Override
    public void process(Page page) {
        // Queue every repository-style link found in the page's HTML.
        page.addTargetRequests(page.getHtml().links().regex(REPO_LINK_REGEX).all());

        // The owner is read from the URL of the page itself, not from its HTML.
        page.putField("author", page.getUrl().regex(AUTHOR_REGEX).toString());

        page.putField("name", page.getHtml().xpath(NAME_XPATH).toString());
    }

    /**
     * Returns the crawl settings used for this processor.
     *
     * @return the {@code Site} configuration held by this instance
     */
    @Override
    public Site getSite() {
        return site;
    }
}
3. 启动爬虫
Spider.create(new GithubRepoPageProcessor())
.addUrl("https://github.com/code4craft")
.addPipeline(new ConsolePipeline())
.thread(5)
.run();