Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 17 additions & 55 deletions webmagic-core/src/main/java/us/codecraft/webmagic/Spider.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,8 @@
import java.util.List;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.SerializationUtils;
import org.slf4j.Logger;
Expand Down Expand Up @@ -75,9 +72,9 @@ public class Spider implements Runnable, Task {
protected Site site;

protected String uuid;

protected Scheduler scheduler = new QueueScheduler();

protected SpiderScheduler scheduler;
protected Logger logger = LoggerFactory.getLogger(getClass());

protected CountableThreadPool threadPool;
Expand All @@ -100,10 +97,6 @@ public class Spider implements Runnable, Task {

protected boolean destroyWhenExit = true;

private ReentrantLock newUrlLock = new ReentrantLock();

private Condition newUrlCondition = newUrlLock.newCondition();

private List<SpiderListener> spiderListeners;

private final AtomicLong pageCount = new AtomicLong(0);
Expand Down Expand Up @@ -131,6 +124,7 @@ public static Spider create(PageProcessor pageProcessor) {
public Spider(PageProcessor pageProcessor) {
this.pageProcessor = pageProcessor;
this.site = pageProcessor.getSite();
this.scheduler = new SpiderScheduler(new QueueScheduler());
}

/**
Expand Down Expand Up @@ -186,15 +180,15 @@ public Spider scheduler(Scheduler scheduler) {
/**
* set scheduler for Spider
*
* @param scheduler scheduler
* @param updateScheduler scheduler
* @return this
* @see Scheduler
* @since 0.2.1
*/
public Spider setScheduler(Scheduler scheduler) {
public Spider setScheduler(Scheduler updateScheduler) {
checkIfRunning();
Scheduler oldScheduler = this.scheduler;
this.scheduler = scheduler;
SpiderScheduler oldScheduler = this.scheduler;
scheduler.setScheduler(updateScheduler);
if (oldScheduler != null) {
Request request;
while ((request = oldScheduler.poll(this)) != null) {
Expand All @@ -213,7 +207,7 @@ public Spider setScheduler(Scheduler scheduler) {
* @deprecated
*/
@Deprecated
public Spider pipeline(Pipeline pipeline) {
public Spider pipeline(Pipeline pipeline) {
return addPipeline(pipeline);
}

Expand Down Expand Up @@ -264,7 +258,7 @@ public Spider clearPipeline() {
* @deprecated
*/
@Deprecated
public Spider downloader(Downloader downloader) {
public Spider downloader(Downloader downloader) {
return setDownloader(downloader);
}

Expand Down Expand Up @@ -333,10 +327,10 @@ public void run() {
}
} else {
// wait until new url added,
if (waitNewUrl()) {
//if interrupted
if (scheduler.waitNewUrl(threadPool, emptySleepTime)) {
// if interrupted
break;
}
}
continue;
}
}
Expand All @@ -353,7 +347,7 @@ public void run() {
logger.error("process request " + request + " error", e);
} finally {
pageCount.incrementAndGet();
signalNewUrl();
scheduler.signalNewUrl();
}
}
});
Expand Down Expand Up @@ -536,7 +530,7 @@ public Spider addUrl(String... urls) {
for (String url : urls) {
addRequest(new Request(url));
}
signalNewUrl();
scheduler.signalNewUrl();
return this;
}

Expand Down Expand Up @@ -588,42 +582,10 @@ public Spider addRequest(Request... requests) {
for (Request request : requests) {
addRequest(request);
}
signalNewUrl();
scheduler.signalNewUrl();
return this;
}

/**
*
* @return isInterrupted
*/
private boolean waitNewUrl() {
// now there may not be any thread live
newUrlLock.lock();
try {
//double check,unnecessary, unless very fast concurrent
if (threadPool.getThreadAlive() == 0) {
return false;
}
//wait for amount of time
newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
return false;
} catch (InterruptedException e) {
// logger.warn("waitNewUrl - interrupted, error {}", e);
return true;
} finally {
newUrlLock.unlock();
}
}

private void signalNewUrl() {
try {
newUrlLock.lock();
newUrlCondition.signalAll();
} finally {
newUrlLock.unlock();
}
}

public void start() {
runAsync();
}
Expand Down Expand Up @@ -799,7 +761,7 @@ public Date getStartTime() {
}

public Scheduler getScheduler() {
return scheduler;
return scheduler.getScheduler();
}

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
package us.codecraft.webmagic;

import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.thread.CountableThreadPool;

/**
 * Wraps a {@link Scheduler} together with the lock/condition pair used by
 * {@link Spider} worker threads to park while the queue is empty and to be
 * woken when new URLs arrive. Extracted so Spider no longer manages the
 * wait/signal plumbing itself.
 *
 * <p>Thread-safety: {@code waitNewUrl}/{@code signalNewUrl} are safe for
 * concurrent use; the wrapped scheduler's own thread-safety is whatever the
 * underlying {@link Scheduler} implementation provides.
 */
public class SpiderScheduler {

    // the delegate doing the actual queueing; replaceable via setScheduler
    private Scheduler scheduler;

    private final ReentrantLock newUrlLock = new ReentrantLock();

    private final Condition newUrlCondition = newUrlLock.newCondition();

    public SpiderScheduler(Scheduler scheduler) {
        this.scheduler = scheduler;
    }

    public Scheduler getScheduler() {
        return scheduler;
    }

    public void setScheduler(Scheduler scheduler) {
        this.scheduler = scheduler;
    }

    /**
     * Polls the next request from the wrapped scheduler.
     *
     * @param spider the task identity passed through to the scheduler
     * @return the next request, or {@code null} when the queue is empty
     */
    public Request poll(Spider spider) {
        return scheduler.poll(spider);
    }

    /**
     * Pushes a request onto the wrapped scheduler.
     *
     * @param request the request to enqueue
     * @param spider  the task identity passed through to the scheduler
     */
    public void push(Request request, Spider spider) {
        scheduler.push(request, spider);
    }

    /**
     * Parks the calling worker for up to {@code emptySleepTime} ms waiting for
     * {@link #signalNewUrl()}, unless no worker threads are alive (in which
     * case there is nothing left that could produce new URLs).
     *
     * @param threadPool     pool whose live-thread count gates the wait
     * @param emptySleepTime maximum wait in milliseconds
     * @return {@code true} if the wait was interrupted, {@code false} otherwise
     */
    public boolean waitNewUrl(CountableThreadPool threadPool, long emptySleepTime) {
        newUrlLock.lock();
        try {
            // double check under the lock: if no thread is alive, waiting is pointless
            if (threadPool.getThreadAlive() == 0) {
                return false;
            }
            // bounded wait; a timeout is not an error, the caller just re-polls
            newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
            return false;
        } catch (InterruptedException e) {
            // restore the interrupt status so callers up the stack can observe it
            Thread.currentThread().interrupt();
            return true;
        } finally {
            newUrlLock.unlock();
        }
    }

    /**
     * Wakes every worker parked in {@link #waitNewUrl(CountableThreadPool, long)}.
     */
    public void signalNewUrl() {
        // acquire the lock BEFORE the try block: if lock() were inside try and
        // failed, finally would unlock a lock we never held, throwing
        // IllegalMonitorStateException and masking the original failure
        newUrlLock.lock();
        try {
            newUrlCondition.signalAll();
        } finally {
            newUrlLock.unlock();
        }
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ private HttpClientContext convertHttpClientContext(Request request, Site site, P
HttpClientContext httpContext = new HttpClientContext();
if (proxy != null && proxy.getUsername() != null) {
AuthState authState = new AuthState();
authState.update(new BasicScheme(ChallengeState.PROXY), new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword()));
BasicScheme proxyAuthScheme = new BasicScheme(ChallengeState.PROXY);
UsernamePasswordCredentials proxyCredentials = new UsernamePasswordCredentials(proxy.getUsername(), proxy.getPassword());
authState.update(proxyAuthScheme, proxyCredentials);
httpContext.setAttribute(HttpClientContext.PROXY_AUTH_STATE, authState);
}
if (request.getCookies() != null && !request.getCookies().isEmpty()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ protected List<Element> getElements() {
return elements;
}

@Override
public Selectable smartContent() {
SmartContentSelector smartContentSelector = Selectors.smartContent();
return select(smartContentSelector, getSourceTexts());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,6 @@ public Selectable xpath(String xpath) {
throw new UnsupportedOperationException("$ can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
}

@Override
public Selectable smartContent() {
throw new UnsupportedOperationException("Smart content can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
}

@Override
public Selectable links() {
throw new UnsupportedOperationException("Links can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc).");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,6 @@ public interface Selectable {
* @return new Selectable after extract
*/
public Selectable css(String selector, String attrName);

/**
* select smart content with ReadAbility algorithm
*
* @return content
*/
public Selectable smartContent();

/**
* select all links
*
Expand Down
Loading