网页主动探测工具使用

发表于:2014-12-02 10:32

字体: | 上一篇 | 下一篇 | 我要投稿

 作者:壹頁書    来源:51Testing软件测试网采编

/**
 * A single step in the link-resolution chain.
 *
 * Implementations in this file either recognise the shape of {@code path}
 * and call {@code newTask.init(...)}, or hand the request on by calling
 * {@code chain.doFilter(...)} with the same arguments.
 */
private interface Filter {

    /**
     * Tries to resolve a link found on a page into a new task.
     *
     * @param fatherTask the task whose page content contained the link
     * @param newTask    the task object to initialise when this step applies
     * @param path       the raw link text extracted from the page
     * @param chain      the chain to delegate to when this step does not apply
     */
    void doFilter(Task fatherTask, Task newTask, String path, Filter chain);
}
/**
 * Ordered collection of filters. Every call to {@link #doFilter} hands the
 * request to the next filter that has not been tried yet, so one instance
 * serves exactly one link.
 */
private class FilterChain implements Filter {

    private List<Filter> list = new ArrayList<Filter>();

    // Created only after every filter has been registered; an iterator taken
    // earlier would fail once the list is modified.
    private Iterator<Filter> it;

    private FilterChain() {
        // Registration order is the order in which the filters are consulted.
        addFilter(new TwoLevel());
        addFilter(new OneLevel());
        addFilter(new FullPath());
        addFilter(new Root());
        addFilter(new Default());
        it = list.iterator();
    }

    private void addFilter(Filter filter) {
        list.add(filter);
    }

    /**
     * Forwards the request to the next untried filter; does nothing once all
     * filters have been consulted.
     */
    @Override
    public void doFilter(Task fatherTask, Task newTask, String path, Filter chain) {
        if (!it.hasNext()) {
            return;
        }
        Filter next = it.next();
        next.doFilter(fatherTask, newTask, path, chain);
    }
}
/**
 * Resolves links that start with "../../" against the directory two levels
 * above the parent page.
 */
private class TwoLevel implements Filter {

    private static final String MARKER = "../../";

    /**
     * Initialises {@code newTask} when {@code path} starts with "../../";
     * otherwise delegates to {@code chain}.
     */
    @Override
    public void doFilter(Task fatherTask, Task newTask, String path, Filter chain) {
        if (path.startsWith(MARKER)) {
            String prefix = getPrefix(fatherTask.getCurrentPath(), 3);
            // Strip only the leading marker; String.replace would also rewrite
            // any later "../../" inside the link.
            String rest = path.substring(MARKER.length());
            // getPrefix returns "/" for the root and otherwise a directory
            // without a trailing slash, so a separator is needed in the
            // latter case ("/a" + "x.htm" must become "/a/x.htm").
            String separator = prefix.endsWith("/") ? "" : "/";
            newTask.init(fatherTask.getHost(), fatherTask.getPort(), prefix + separator + rest);
        } else {
            chain.doFilter(fatherTask, newTask, path, chain);
        }
    }
}
/**
 * Resolves links that start with "../" against the directory one level
 * above the parent page.
 */
private class OneLevel implements Filter {

    private static final String MARKER = "../";

    /**
     * Initialises {@code newTask} when {@code path} starts with "../";
     * otherwise delegates to {@code chain}.
     */
    @Override
    public void doFilter(Task fatherTask, Task newTask, String path, Filter chain) {
        if (path.startsWith(MARKER)) {
            String prefix = getPrefix(fatherTask.getCurrentPath(), 2);
            // Strip only the leading marker; String.replace would also rewrite
            // any later "../" inside the link.
            String rest = path.substring(MARKER.length());
            // getPrefix returns "/" for the root and otherwise a directory
            // without a trailing slash, so a separator is needed in the
            // latter case ("/a" + "x.htm" must become "/a/x.htm").
            String separator = prefix.endsWith("/") ? "" : "/";
            newTask.init(fatherTask.getHost(), fatherTask.getPort(), prefix + separator + rest);
        } else {
            chain.doFilter(fatherTask, newTask, path, chain);
        }
    }
}
/**
 * Handles absolute "http://" links. Only links whose host is listed in
 * {@code domainlist} are turned into a task.
 */
private class FullPath implements Filter {

    /**
     * Initialises {@code newTask} for an absolute link to a listed domain.
     * An absolute link to any other domain is consumed here without
     * initialising {@code newTask} and without delegating further.
     * Non-absolute links are delegated to {@code chain}.
     */
    @Override
    public void doFilter(Task fatherTask, Task newTask, String path, Filter chain) {
        if (!path.startsWith("http://")) {
            chain.doFilter(fatherTask, newTask, path, chain);
            return;
        }
        for (String domain : domainlist) {
            String origin = "http://" + domain + "/";
            if (path.startsWith(origin)) {
                // Drop only the leading scheme and host, keeping the "/" that
                // starts the path. String.replace would also rewrite the same
                // text if it appeared again later in the URL.
                newTask.init(domain, fatherTask.getPort(), path.substring(origin.length() - 1));
                return;
            }
        }
        // No listed domain matched. Assigning null to the newTask parameter
        // would not be visible to the caller, so the task is simply left
        // un-initialised.
    }
}
/**
 * Handles host-relative links, i.e. links that begin with "/".
 */
private class Root implements Filter {

    /**
     * Uses {@code path} unchanged on the parent's host and port when it
     * starts with "/"; otherwise delegates to {@code chain}.
     */
    @Override
    public void doFilter(Task fatherTask, Task newTask, String path, Filter chain) {
        boolean hostRelative = path.startsWith("/");
        if (!hostRelative) {
            chain.doFilter(fatherTask, newTask, path, chain);
            return;
        }
        newTask.init(fatherTask.getHost(), fatherTask.getPort(), path);
    }
}
/**
 * Last filter in the chain: treats the link as relative to the directory of
 * the parent page. It never delegates.
 */
private class Default implements Filter {

    /**
     * Initialises {@code newTask} with the parent's directory joined to
     * {@code path}.
     */
    @Override
    public void doFilter(Task fatherTask, Task newTask, String path, Filter chain) {
        String prefix = getPrefix(fatherTask.getCurrentPath(), 1);
        // getPrefix returns "/" for the root directory; adding another "/"
        // there would produce "//page.htm".
        String separator = prefix.endsWith("/") ? "" : "/";
        newTask.init(fatherTask.getHost(), fatherTask.getPort(), prefix + separator + path);
    }
}
/**
 * Creates a parse worker wired to the three task queues.
 *
 * @param connectlist     queue that receives newly discovered tasks
 * @param parselist       queue this worker takes tasks from
 * @param persistencelist queue that receives tasks once they are parsed
 * @param domainlist      domains accepted for absolute "http://" links
 */
public ParseHandler(
        BlockingQueue<Task> connectlist,
        BlockingQueue<Task> parselist,
        BlockingQueue<Task> persistencelist,
        List<String> domainlist) {
    this.domainlist = domainlist;
    this.persistencelist = persistencelist;
    this.parselist = parselist;
    this.connectlist = connectlist;
}
// Matches a double-quoted string that contains ".htm", e.g. "news/a.html".
private Pattern pattern = Pattern.compile("\"[^\"]+\\.htm[^\"]*\"");

/**
 * Processes one task from the parse queue: derives its status code and, for
 * a 200 response, turns every not-yet-seen ".htm" link in the content into a
 * new task. Afterwards the content is cleared and the task is handed to the
 * persistence queue. Any exception is printed and ends the processing of
 * the current task.
 */
private void handler() {
    try {
        Task task = parselist.take();
        parseTaskState(task);
        if (200 == task.getState()) {
            Matcher links = pattern.matcher(task.getContent());
            while (links.find()) {
                String quoted = links.group();
                // Skip anything with whitespace, parentheses or a colon.
                // Note that the colon rule also rejects absolute "http://" links.
                boolean rejected = quoted.contains(" ") || quoted.contains("\t")
                        || quoted.contains("(") || quoted.contains(")")
                        || quoted.contains(":");
                if (rejected) {
                    continue;
                }
                // Remove the surrounding double quotes.
                String link = quoted.substring(1, quoted.length() - 1);
                if (SET.contains(link)) {
                    continue;
                }
                SET.add(link);
                createNewTask(task, link);
            }
        }
        // The page body is no longer needed once its links are extracted.
        task.setContent(null);
        persistencelist.put(task);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Extracts the three-digit status code from the raw response stored in the
 * task and records it with {@code task.setState}.
 *
 * @param task task whose content starts with the raw server response
 * @throws NumberFormatException           if the expected position does not hold digits
 * @throws StringIndexOutOfBoundsException if the content is shorter than expected
 */
private void parseTaskState(Task task) {
    String content = task.getContent();
    // "HTTP/1.0 200 ..." and "HTTP/1.1 200 ..." both carry the code at 9..11.
    // Accepting 1.0 matters because the request itself is sent as HTTP/1.0.
    // Offset 19 is kept from the original for any other response format;
    // presumably it is specific to the crawled site - TODO confirm.
    int from = content.startsWith("HTTP/1.") ? 9 : 19;
    task.setState(Integer.parseInt(content.substring(from, from + 3)));
}
/**
 * Resolves a link found on {@code fatherTask}'s page through a fresh filter
 * chain and queues the resulting task for download.
 *
 * @param fatherTask the task whose page contained the link
 * @param path       the raw link text
 * @throws Exception if the filters fail or the thread is interrupted while
 *                   waiting for space in the connect queue
 */
private void createNewTask(Task fatherTask, String path) throws Exception {
    Task newTask = new Task();
    FilterChain filterchain = new FilterChain();
    filterchain.doFilter(fatherTask, newTask, path, filterchain);
    // A filter cannot replace the caller's reference, so a "newTask != null"
    // test is always true. Queue the task only when a filter initialised it.
    // NOTE(review): assumes getHost() is null until init(...) is called -
    // confirm against Task.
    if (newTask.getHost() != null) {
        connectlist.put(newTask);
    }
}
/**
 * Removes the last {@code count} "/"-separated segments from {@code s}.
 *
 * For example, {@code getPrefix("/a/b/c.htm", 1)} is "/a/b" and
 * {@code getPrefix("/a/b/c.htm", 3)} is "/". When {@code s} has fewer
 * segments than {@code count}, the result is clamped to "/" instead of
 * failing.
 *
 * @param s     a path such as "/dir/page.htm"
 * @param count number of trailing segments to remove
 * @return the remaining prefix without a trailing slash, or "/" when nothing remains
 */
private String getPrefix(String s, int count) {
    String prefix = s;
    while (count > 0) {
        int slash = prefix.lastIndexOf("/");
        if (slash < 0) {
            // No segment left to remove: substring(0, -1) would throw here.
            prefix = "";
            break;
        }
        prefix = prefix.substring(0, slash);
        count--;
    }
    return "".equals(prefix) ? "/" : prefix;
}
/**
 * Worker loop: handles one task after another without end and bumps the
 * shared counter after every attempt, whether or not it succeeded.
 */
@Override
public void run() {
    for (;;) {
        handler();
        COUNT.incrementAndGet();
    }
}
}
/**
 * Worker that takes tasks from the connect queue, downloads the page over a
 * plain socket, records the elapsed time and passes the task on to the
 * parse queue.
 */
class ConnectHandler implements Runnable {

    // Number of handler() attempts made by all instances, failed ones included.
    private static final AtomicInteger COUNT = new AtomicInteger();

    private BlockingQueue<Task> connectlist;
    private BlockingQueue<Task> parselist;

    /** Returns the number of handler attempts made so far. */
    public static int GETCOUNT() {
        return COUNT.get();
    }

    /**
     * @param connectlist queue this worker takes tasks from
     * @param parselist   queue that receives downloaded tasks
     */
    public ConnectHandler(BlockingQueue<Task> connectlist, BlockingQueue<Task> parselist) {
        this.connectlist = connectlist;
        this.parselist = parselist;
    }

    /**
     * Downloads one task and forwards it. A task whose download fails is
     * reported on stderr and is not forwarded.
     */
    private void handler() {
        try {
            Task task = connectlist.take();
            long start = System.currentTimeMillis();
            getHtml(task);
            long end = System.currentTimeMillis();
            task.setTaskTime(end - start);
            parselist.put(task);
        } catch (InterruptedException e) {
            // Restore the flag so that run() can see the interrupt and stop.
            Thread.currentThread().interrupt();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Sends a minimal HTTP/1.0 GET for the task's path and stores the whole
     * response, with line terminators removed, as the task content.
     *
     * @param task task providing host, port and path; receives the content
     * @throws Exception on any resolution, connection or I/O failure
     */
    private void getHtml(Task task) throws Exception {
        StringBuilder sb = new StringBuilder(2048);
        InetAddress addr = InetAddress.getByName(task.getHost());
        Socket socket = new Socket(addr, task.getPort());
        try {
            BufferedWriter wr = new BufferedWriter(new OutputStreamWriter(socket.getOutputStream(), "UTF-8"));
            wr.write("GET " + task.getCurrentPath() + " HTTP/1.0\r\n");
            wr.write("HOST:" + task.getHost() + "\r\n");
            wr.write("Accept:*/*\r\n");
            wr.write("\r\n");
            wr.flush();
            // NOTE(review): the response is decoded with the platform default
            // charset, as before; the page's real encoding is not known here.
            BufferedReader rd = new BufferedReader(new InputStreamReader(socket.getInputStream()));
            String line;
            while ((line = rd.readLine()) != null) {
                sb.append(line);
            }
            task.setContent(sb.toString());
        } finally {
            // Closing the socket also closes both of its streams, and it now
            // happens even when a write or read fails.
            socket.close();
        }
    }

    /**
     * Worker loop: runs until the thread is interrupted. The counter is
     * incremented after every attempt.
     */
    @Override
    public void run() {
        while (!Thread.currentThread().isInterrupted()) {
            this.handler();
            COUNT.addAndGet(1);
        }
    }
}
/**
 * Worker that takes finished tasks from the persistence queue and inserts
 * one row per task into the {@code probe} table.
 */
class PersistenceHandler implements Runnable {

    static {
        try {
            Class.forName("oracle.jdbc.OracleDriver");
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }

    // Number of handler() attempts made by all instances, failed ones included.
    private static final AtomicInteger COUNT = new AtomicInteger();

    private BlockingQueue<Task> persistencelist;
    private Connection conn;
    private PreparedStatement ps;

    /** Returns the number of handler attempts made so far. */
    public static int GETCOUNT() {
        return COUNT.get();
    }

    /**
     * Opens the database connection and prepares the insert statement. A
     * failure is only printed, as before; {@code conn} and {@code ps} are
     * then left null.
     *
     * @param persistencelist queue this worker takes tasks from
     */
    public PersistenceHandler(BlockingQueue<Task> persistencelist) {
        this.persistencelist = persistencelist;
        try {
            // The Oracle thin driver expects "@" before host:port:sid.
            conn = DriverManager.getConnection("jdbc:oracle:thin:@127.0.0.1:1521:orcl", "edmond", "edmond");
            // handler() commits explicitly, which is only valid when
            // auto-commit is switched off.
            conn.setAutoCommit(false);
            ps = conn
                    .prepareStatement("insert into probe(id,host,path,state,tasktime,type) values(seq_probe_id.nextval,?,?,?,?,?)");
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Worker loop: runs until the thread is interrupted, then releases the
     * statement and the connection. The counter is incremented after every
     * attempt.
     */
    @Override
    public void run() {
        try {
            while (!Thread.currentThread().isInterrupted()) {
                this.handler();
                COUNT.addAndGet(1);
            }
        } finally {
            closeQuietly();
        }
    }

    /** Inserts one task as a row and commits it. */
    private void handler() {
        try {
            Task task = persistencelist.take();
            ps.setString(1, task.getHost());
            ps.setString(2, task.getCurrentPath());
            ps.setInt(3, task.getState());
            ps.setLong(4, task.getTaskTime());
            ps.setString(5, task.getType());
            ps.executeUpdate();
            conn.commit();
        } catch (InterruptedException e) {
            // Restore the flag so that run() can see the interrupt and stop.
            Thread.currentThread().interrupt();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /** Closes the statement and the connection, reporting but not propagating errors. */
    private void closeQuietly() {
        try {
            if (ps != null) {
                ps.close();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        try {
            if (conn != null) {
                conn.close();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}
  ParseHandler 使用了职责链(Chain of Responsibility)模式,各过滤器的分工如下:
  TwoLevel 处理../../开头的连接(../../sucai/sucai.htm)
  OneLevel 处理../开头的连接(../sucai/sucai.htm)
  FullPath 处理绝对路径的连接(http://域名/sucai/sucai.htm)
  Root 处理/开头的连接(/sucai/sucai.htm)
  Default 处理常规的连接(sucai.htm)
  ParseHandler 的 FullPath 过滤器需要一个域名白名单,
  这样可以把程序的爬行范围限制在固定的域名内。
  ParseHandler 的 parseTaskState 方法负责解析状态码,可能需要根据实际情况进行调整。
  比如网页不存在(404)时,服务器可能会返回一个自定义的错误页,而不是通常的 HTTP 状态码。
  第一版仅仅实现了功能,错误处理不完整,
  所以仅仅在定制的域名下生效,其实并不通用,后续会逐步完善.
33/3<123
《2023软件测试行业现状调查报告》独家发布~

关注51Testing

联系我们

快捷面板 站点地图 联系我们 广告服务 关于我们 站长统计 发展历程

法律顾问:上海兰迪律师事务所 项棋律师
版权所有 上海博为峰软件技术股份有限公司 Copyright©51testing.com 2003-2024
投诉及意见反馈:webmaster@51testing.com; 业务联系:service@51testing.com 021-64471599-8017

沪ICP备05003035号

沪公网安备 31010102002173号