public class CrawlController extends Configurable
| Modifier and Type | Class and Description |
|---|---|
static interface |
CrawlController.WebCrawlerFactory<T extends WebCrawler> |
| Modifier and Type | Field and Description |
|---|---|
protected List<Object> |
crawlersLocalData
Once the crawling session finishes the controller collects the local data
of the crawler threads and stores them in this List.
|
protected Object |
customData
The 'customData' object can be used for passing custom crawl-related
configurations to different components of the crawler.
|
protected DocIDServer |
docIdServer |
protected com.sleepycat.je.Environment |
env |
protected boolean |
finished
Is the crawling of this session finished?
|
protected Frontier |
frontier |
protected PageFetcher |
pageFetcher |
protected RobotstxtServer |
robotstxtServer |
protected boolean |
shuttingDown
Is the crawling session set to 'shutdown'?
|
protected Object |
waitingLock |
config
| Constructor and Description |
|---|
CrawlController(CrawlConfig config,
PageFetcher pageFetcher,
RobotstxtServer robotstxtServer) |
| Modifier and Type | Method and Description |
|---|---|
void |
addSeed(String pageUrl)
Adds a new seed URL.
|
void |
addSeed(String pageUrl,
int docId)
Adds a new seed URL.
|
void |
addSeenUrl(String url,
int docId)
This function can be called to assign a specific document id to a url.
|
List<Object> |
getCrawlersLocalData()
Once the crawling session finishes the controller collects the local data of the crawler
threads and stores them
in a List.
|
Object |
getCustomData() |
DocIDServer |
getDocIdServer() |
Frontier |
getFrontier() |
PageFetcher |
getPageFetcher() |
RobotstxtServer |
getRobotstxtServer() |
boolean |
isFinished() |
boolean |
isShuttingDown() |
void |
setCustomData(Object customData) |
void |
setDocIdServer(DocIDServer docIdServer) |
void |
setFrontier(Frontier frontier) |
void |
setPageFetcher(PageFetcher pageFetcher) |
void |
setRobotstxtServer(RobotstxtServer robotstxtServer) |
void |
shutdown()
Set the current crawling session to 'shutdown'.
|
protected static void |
sleep(int seconds) |
<T extends WebCrawler> |
start(Class<T> clazz,
int numberOfCrawlers)
Start the crawling session and wait for it to finish.
|
<T extends WebCrawler> |
start(CrawlController.WebCrawlerFactory<T> crawlerFactory,
int numberOfCrawlers)
Start the crawling session and wait for it to finish.
|
protected <T extends WebCrawler> |
start(CrawlController.WebCrawlerFactory<T> crawlerFactory,
int numberOfCrawlers,
boolean isBlocking) |
<T extends WebCrawler> |
startNonBlocking(Class<T> clazz,
int numberOfCrawlers)
Start the crawling session and return immediately.
|
<T extends WebCrawler> |
startNonBlocking(CrawlController.WebCrawlerFactory<T> crawlerFactory,
int numberOfCrawlers)
Start the crawling session and return immediately.
|
void |
waitUntilFinish()
Wait until this crawling session finishes.
|
getConfig
protected Object customData
protected List<Object> crawlersLocalData
protected boolean finished
protected boolean shuttingDown
protected PageFetcher pageFetcher
protected RobotstxtServer robotstxtServer
protected Frontier frontier
protected DocIDServer docIdServer
protected final Object waitingLock
protected final com.sleepycat.je.Environment env
public CrawlController(CrawlConfig config, PageFetcher pageFetcher, RobotstxtServer robotstxtServer) throws Exception
Throws: Exception
public <T extends WebCrawler> void start(Class<T> clazz, int numberOfCrawlers)
Type Parameters: T - Your class extending WebCrawler
Parameters: clazz - the class that implements the logic for crawler threads
numberOfCrawlers - the number of concurrent threads that will be contributing in
this crawling session.
public <T extends WebCrawler> void start(CrawlController.WebCrawlerFactory<T> crawlerFactory, int numberOfCrawlers)
Type Parameters: T - Your class extending WebCrawler
Parameters: crawlerFactory - factory to create crawlers on demand for each thread
numberOfCrawlers - the number of concurrent threads that will be contributing in
this crawling session.
public <T extends WebCrawler> void startNonBlocking(CrawlController.WebCrawlerFactory<T> crawlerFactory, int numberOfCrawlers)
Type Parameters: T - Your class extending WebCrawler
Parameters: crawlerFactory - factory to create crawlers on demand for each thread
numberOfCrawlers - the number of concurrent threads that will be contributing in
this crawling session.
public <T extends WebCrawler> void startNonBlocking(Class<T> clazz, int numberOfCrawlers)
Type Parameters: T - Your class extending WebCrawler
Parameters: clazz - the class that implements the logic for crawler threads
numberOfCrawlers - the number of concurrent threads that will be contributing in
this crawling session.
protected <T extends WebCrawler> void start(CrawlController.WebCrawlerFactory<T> crawlerFactory, int numberOfCrawlers, boolean isBlocking)
public void waitUntilFinish()
public List<Object> getCrawlersLocalData()
protected static void sleep(int seconds)
public void addSeed(String pageUrl)
Parameters: pageUrl - the URL of the seed
public void addSeed(String pageUrl, int docId)
Parameters: pageUrl - the URL of the seed
docId - the document id that you want to be assigned to this seed URL.
public void addSeenUrl(String url, int docId)
Parameters: url - the URL of the page
docId - the document id that you want to be assigned to this URL.
public PageFetcher getPageFetcher()
public void setPageFetcher(PageFetcher pageFetcher)
public RobotstxtServer getRobotstxtServer()
public void setRobotstxtServer(RobotstxtServer robotstxtServer)
public Frontier getFrontier()
public void setFrontier(Frontier frontier)
public DocIDServer getDocIdServer()
public void setDocIdServer(DocIDServer docIdServer)
public Object getCustomData()
public void setCustomData(Object customData)
public boolean isFinished()
public boolean isShuttingDown()
public void shutdown()
Copyright © 2017. All rights reserved.