public class Spider extends Object implements Runnable, Task
See also: Downloader, Scheduler, PageProcessor, Pipeline
Modifier and Type | Class and Description |
---|---|
static class |
Spider.Status |
Modifier and Type | Field and Description |
---|---|
protected boolean |
destroyWhenExit |
protected Downloader |
downloader |
protected ExecutorService |
executorService |
protected boolean |
exitWhenComplete |
protected org.slf4j.Logger |
logger |
protected PageProcessor |
pageProcessor |
protected List<Pipeline> |
pipelines |
protected Scheduler |
scheduler |
protected Site |
site |
protected boolean |
spawnUrl |
protected List<Request> |
startRequests |
protected AtomicInteger |
stat |
protected static int |
STAT_INIT |
protected static int |
STAT_RUNNING |
protected static int |
STAT_STOPPED |
protected int |
threadNum |
protected CountableThreadPool |
threadPool |
protected String |
uuid |
Constructor and Description |
---|
Spider(PageProcessor pageProcessor)
create a spider with pageProcessor.
|
Modifier and Type | Method and Description |
---|---|
Spider |
addPipeline(Pipeline pipeline)
add a pipeline for Spider
|
Spider |
addRequest(Request... requests)
Add urls with information to crawl.
|
Spider |
addUrl(String... urls)
Add urls to crawl.
|
protected void |
checkIfRunning() |
Spider |
clearPipeline()
clear the pipelines set
|
void |
close() |
static Spider |
create(PageProcessor pageProcessor)
create a spider with pageProcessor.
|
Spider |
downloader(Downloader downloader)
Deprecated.
|
protected void |
extractAndAddRequests(Page page,
boolean spawnUrl) |
<T> T |
get(String url) |
<T> List<T> |
getAll(Collection<String> urls)
Download urls synchronously.
|
protected CollectorPipeline |
getCollectorPipeline() |
long |
getPageCount()
Get page count downloaded by spider.
|
Scheduler |
getScheduler() |
Site |
getSite()
site of a task
|
List<SpiderListener> |
getSpiderListeners() |
Date |
getStartTime() |
Spider.Status |
getStatus()
Get running status by spider.
|
int |
getThreadAlive()
Get the number of threads that are running.
|
String |
getUUID()
unique id for a task.
|
protected void |
initComponent() |
boolean |
isExitWhenComplete() |
boolean |
isSpawnUrl() |
protected void |
onError(Request request) |
protected void |
onSuccess(Request request) |
Spider |
pipeline(Pipeline pipeline)
Deprecated.
|
void |
run() |
void |
runAsync() |
Spider |
scheduler(Scheduler scheduler)
Deprecated.
|
Spider |
setDownloader(Downloader downloader)
set the downloader of spider
|
void |
setEmptySleepTime(int emptySleepTime)
Set wait time when no url is polled.
|
Spider |
setExecutorService(ExecutorService executorService) |
Spider |
setExitWhenComplete(boolean exitWhenComplete)
Exit when complete.
|
Spider |
setPipelines(List<Pipeline> pipelines)
set pipelines for Spider
|
Spider |
setScheduler(Scheduler scheduler)
set scheduler for Spider
|
Spider |
setSpawnUrl(boolean spawnUrl)
Whether to add extracted urls to download.
Add urls to download when it is true, and just download seed urls when it is false. |
Spider |
setSpiderListeners(List<SpiderListener> spiderListeners) |
Spider |
setUUID(String uuid)
Set a uuid for the spider.
The default uuid is the domain of the site. |
protected void |
sleep(int time) |
void |
start() |
Spider |
startRequest(List<Request> startRequests)
Set startUrls of Spider.
Takes precedence over the startUrls of Site. |
Spider |
startUrls(List<String> startUrls)
Set startUrls of Spider.
Takes precedence over the startUrls of Site. |
void |
stop() |
void |
test(String... urls)
Process specific urls without url discovery.
|
Spider |
thread(ExecutorService executorService,
int threadNum)
start with more than one thread
|
Spider |
thread(int threadNum)
start with more than one thread
|
protected Downloader downloader
protected PageProcessor pageProcessor
protected Site site
protected String uuid
protected Scheduler scheduler
protected org.slf4j.Logger logger
protected CountableThreadPool threadPool
protected ExecutorService executorService
protected int threadNum
protected AtomicInteger stat
protected boolean exitWhenComplete
protected static final int STAT_INIT
protected static final int STAT_RUNNING
protected static final int STAT_STOPPED
protected boolean spawnUrl
protected boolean destroyWhenExit
public Spider(PageProcessor pageProcessor)
pageProcessor - pageProcessor
public static Spider create(PageProcessor pageProcessor)
pageProcessor - pageProcessor
See also: PageProcessor
public Spider startUrls(List<String> startUrls)
startUrls - startUrls
public Spider startRequest(List<Request> startRequests)
startRequests - startRequests
public Spider setUUID(String uuid)
uuid - uuid
@Deprecated public Spider scheduler(Scheduler scheduler)
scheduler - scheduler
See also: setScheduler(us.codecraft.webmagic.scheduler.Scheduler)
public Spider setScheduler(Scheduler scheduler)
scheduler - scheduler
See also: Scheduler
public Spider pipeline(Pipeline pipeline)
pipeline - pipeline
See also: addPipeline(us.codecraft.webmagic.pipeline.Pipeline)
public Spider addPipeline(Pipeline pipeline)
pipeline - pipeline
See also: Pipeline
public Spider setPipelines(List<Pipeline> pipelines)
pipelines - pipelines
See also: Pipeline
public Spider clearPipeline()
public Spider downloader(Downloader downloader)
downloader - downloader
See also: setDownloader(us.codecraft.webmagic.downloader.Downloader)
public Spider setDownloader(Downloader downloader)
downloader - downloader
See also: Downloader
protected void initComponent()
protected void onError(Request request)
protected void onSuccess(Request request)
public void close()
public void test(String... urls)
urls - urls to process
protected void sleep(int time)
protected void extractAndAddRequests(Page page, boolean spawnUrl)
protected void checkIfRunning()
public void runAsync()
public Spider addUrl(String... urls)
urls - urls
public <T> List<T> getAll(Collection<String> urls)
T - type of process result
urls - urls
protected CollectorPipeline getCollectorPipeline()
public <T> T get(String url)
public Spider addRequest(Request... requests)
requests - requests
public void start()
public void stop()
public Spider thread(int threadNum)
threadNum - threadNum
public Spider thread(ExecutorService executorService, int threadNum)
executorService - executorService to run the spider
threadNum - threadNum
public boolean isExitWhenComplete()
public Spider setExitWhenComplete(boolean exitWhenComplete)
exitWhenComplete - exitWhenComplete
public boolean isSpawnUrl()
public long getPageCount()
public Spider.Status getStatus()
See also: Spider.Status
public int getThreadAlive()
public Spider setSpawnUrl(boolean spawnUrl)
spawnUrl - spawnUrl
public Spider setExecutorService(ExecutorService executorService)
public List<SpiderListener> getSpiderListeners()
public Spider setSpiderListeners(List<SpiderListener> spiderListeners)
public Date getStartTime()
public Scheduler getScheduler()
public void setEmptySleepTime(int emptySleepTime)
emptySleepTime - In MILLISECONDS.
Copyright © 2017. All rights reserved.