WebMagic 有一个 selenium 模块，其中实现了一个 SeleniumDownloader，但我觉得它的灵活性不够，所以参考它自己实现了一个。
首先是WebDriverPool用来管理WebDriver池:
import java.util.ArrayList; import java.util.concurrent.BlockingDeque; import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import org.openqa.selenium.WebDriver; import org.openqa.selenium.phantomjs.PhantomJSDriver; import org.openqa.selenium.phantomjs.PhantomJSDriverService; import org.openqa.selenium.remote.DesiredCapabilities; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import net.xby1993.common.util.FileUtil; /** * @author taojw */ public class WebDriverPool { private Logger logger = LoggerFactory.getLogger(getClass()); private int CAPACITY = 5; private AtomicInteger refCount = new AtomicInteger(0); private static final String DRIVER_PHANTOMJS = "phantomjs"; /** * store webDrivers available */ private BlockingDeque<WebDriver> innerQueue = new LinkedBlockingDeque<WebDriver>( CAPACITY); private static String PHANTOMJS_PATH; private static DesiredCapabilities caps = DesiredCapabilities.phantomjs(); static { PHANTOMJS_PATH = FileUtil.getCommonProp("phantomjs.path"); caps.setJavascriptEnabled(true); caps.setCapability( PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, PHANTOMJS_PATH); caps.setCapability("takesScreenshot", false); caps.setCapability( PhantomJSDriverService.PHANTOMJS_PAGE_CUSTOMHEADERS_PREFIX + "User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"); ArrayList<String> cliArgsCap = new ArrayList<String>(); //http://phantomjs.org/api/command-line.html cliArgsCap.add("--web-security=false"); cliArgsCap.add("--ssl-protocol=any"); cliArgsCap.add("--ignore-ssl-errors=true"); cliArgsCap.add("--load-images=false"); //不加载图片 caps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS, cliArgsCap); caps.setCapability( PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_CLI_ARGS, new String[] {"--logLevel=INFO"}); } public WebDriverPool() { } public WebDriverPool(int poolsize) { this.CAPACITY = 
poolsize; innerQueue = new LinkedBlockingDeque<WebDriver>(poolsize); } public WebDriver get() throws InterruptedException { WebDriver poll = innerQueue.poll(); if (poll != null) { return poll; } if (refCount.get() < CAPACITY) { synchronized (innerQueue) { if (refCount.get() < CAPACITY) { WebDriver mDriver = new PhantomJSDriver(caps); // 尝试性解决:https://github.com/ariya/phantomjs/issues/11526问题 mDriver.manage().timeouts() .pageLoadTimeout(60, TimeUnit.SECONDS); // mDriver.manage().window().setSize(new Dimension(1366, // 768)); innerQueue.add(mDriver); refCount.incrementAndGet(); } } } return innerQueue.take(); } public void returnToPool(WebDriver webDriver) { // webDriver.quit(); // webDriver=null; innerQueue.add(webDriver); } public void close(WebDriver webDriver) { refCount.decrementAndGet(); webDriver.quit(); webDriver = null; } public void shutdown() { try { for (WebDriver driver : innerQueue) { close(driver); } innerQueue.clear(); } catch (Exception e) { // e.printStackTrace(); logger.warn("webdriverpool关闭失败",e); } } } |
之后便是SeleniumDownloader
import org.openqa.selenium.By; import org.openqa.selenium.Cookie; import org.openqa.selenium.WebDriver; import org.openqa.selenium.WebElement; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.downloader.Downloader; import us.codecraft.webmagic.selector.Html; import us.codecraft.webmagic.selector.PlainText; import us.codecraft.webmagic.utils.UrlUtils; import java.util.Map; /** * @author taojw * */ public class SeleniumDownloader implements Downloader{ private static final Logger log=LoggerFactory.getLogger(SeleniumDownloader.class); private int sleepTime=3000;//3s private SeleniumAction action=null; private WebDriverPool webDriverPool=new WebDriverPool(); public SeleniumDownloader(){ } public SeleniumDownloader(int sleepTime,WebDriverPool pool){ this(sleepTime,pool,null); } public SeleniumDownloader(int sleepTime,WebDriverPool pool,SeleniumAction action){ this.sleepTime=sleepTime; this.action=action; if(pool!=null){ webDriverPool=pool; } } public SeleniumDownloader setSleepTime(int sleepTime) { this.sleepTime = sleepTime; return this; } public void setOperator(SeleniumAction action){ this.action=action; } @Override public Page download(Request request, Task task) { WebDriver webDriver; try { webDriver = webDriverPool.get(); } catch (InterruptedException e) { log.warn("interrupted", e); return null; } log.info("downloading page " + request.getUrl()); Page page = new Page(); try { webDriver.get(request.getUrl()); Thread.sleep(sleepTime); } catch (InterruptedException e) { e.printStackTrace(); } catch (Exception e) { webDriverPool.close(webDriver); page.setSkip(true); return page; } // WindowUtil.changeWindow(webDriver); WebDriver.Options manage = webDriver.manage(); Site site = task.getSite(); if (site.getCookies() != null) { for (Map.Entry<String, String> cookieEntry : site.getCookies() 
.entrySet()) { Cookie cookie = new Cookie(cookieEntry.getKey(), cookieEntry.getValue()); manage.addCookie(cookie); } } manage.window().maximize(); if(action!=null){ action.execute(webDriver); } SeleniumAction reqAction=(SeleniumAction) request.getExtra("action"); if(reqAction!=null){ reqAction.execute(webDriver); } WebElement webElement = webDriver.findElement(By.xpath("/html")); String content = webElement.getAttribute("outerHTML"); page.setRawText(content); page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, webDriver.getCurrentUrl()))); page.setUrl(new PlainText(webDriver.getCurrentUrl())); page.setRequest(request); webDriverPool.returnToPool(webDriver); return page; } @Override public void setThread(int thread) { } } |
这里的扩展性主要体现在,我加入了SeleniumAction接口,可以在SeleniumDownloader初始化的时候配置一个全局的SeleniumAction,以及为每个Request配置对应的SeleniumAction。 SeleniumAction接口如下:
public interface SeleniumAction { void execute(WebDriver driver); } |
execute 方法会接收一个 WebDriver 实例，你可以在其中执行任意的 Selenium 操作。
上文内容不用于商业目的,如涉及知识产权问题,请权利人联系博为峰小编(021-64471599-8017),我们将立即处理。