diff --git a/WebContent/img/baijiahao_logo.png b/WebContent/img/baijiahao_logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..71dc9cc7e44b0638d167642c2b894b6263d47c7f
Binary files /dev/null and b/WebContent/img/baijiahao_logo.png differ
diff --git a/WebContent/img/bd_developer_logo.png b/WebContent/img/bd_developer_logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..2c32468a059eb833ed0a67fd104971120c29a721
Binary files /dev/null and b/WebContent/img/bd_developer_logo.png differ
diff --git a/WebContent/img/wechat_logo.png b/WebContent/img/wechat_logo.png
new file mode 100644
index 0000000000000000000000000000000000000000..892de473b30378d13e15daf342c7b401c886a1f7
Binary files /dev/null and b/WebContent/img/wechat_logo.png differ
diff --git a/WebContent/index.html b/WebContent/index.html
index 618c88f8de5b3a4e713d7444ef59fb5663860ee9..6cad920dfb03cdadb4831ce5f110051dc702da7f 100644
--- a/WebContent/index.html
+++ b/WebContent/index.html
@@ -53,6 +53,9 @@
src="img/wordpress-logo.jpg" />
+
+
+
diff --git a/src/common/Spider.xml b/src/common/Spider.xml
index cf674b4c28f6d008388fc610e312a90f4da39941..ca98fcc4b6258de17c12c4de3896a139888ad3b4 100644
--- a/src/common/Spider.xml
+++ b/src/common/Spider.xml
@@ -188,7 +188,7 @@
- wordpress
+ wordpress
]]>
@@ -214,7 +214,7 @@
]]>
-
+
@@ -230,4 +230,90 @@
cpp
+
+
+
+ mp.weixin.qq.com
+ utf-8
+ weixin.qq.com
+ /s/
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ]]>
+ ]]>
+ ]]>
+
+
+
+
+
+ developer.baidu.com
+ utf-8
+ baidu.com
+ /topic/show/
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ]]>
+ ]]>
+ ]]>
+
+
+
+
+
+ baijiahao.baidu.com
+ utf-8
+ baidu.com
+ /s
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ]]>
+ ]]>
+ ]]>
+
+
\ No newline at end of file
diff --git a/src/common/SpiderConfigTool.java b/src/common/SpiderConfigTool.java
index 99ce6985beb4e52c741eab0aa4718504822adc88..e6924629597a106ca4e68a2832cf19cd7be3dc57 100644
--- a/src/common/SpiderConfigTool.java
+++ b/src/common/SpiderConfigTool.java
@@ -15,7 +15,7 @@ public class SpiderConfigTool {
private SAXReader reader;
private Document doc;
private Node spiderNode;
-
+
public SpiderConfigTool(String spiderName) throws DocumentException{
file = new File(this.getClass().getResource("").getPath()+configPath);
@@ -24,29 +24,30 @@ public class SpiderConfigTool {
}
reader = new SAXReader();
doc = reader.read(file);
- spiderNode = getSpider(spiderName);
-
+ spiderNode = getSpider(spiderName);
+
}
-
+
public Node getSpiderNode(){
return spiderNode;
}
-
+
@SuppressWarnings("unchecked")
private Node getSpider(String spiderName){
List list = doc.selectNodes("config/spider-cofig");
-
+
for(Node i : list){
- if(i.selectSingleNode("name").getText().equals(spiderName)){
+ Node domain = i.selectSingleNode("domain");
+ if (domain != null && domain.getText().equals(spiderName)) {
return i;
}
}
-
+
return null;
}
-
+
public Document getDoc(){
return doc;
}
-
+
}
diff --git a/src/spider/BlogPageProcessor.java b/src/spider/BlogPageProcessor.java
index eb783b68062538f1aab0147050f64b89cc219e98..99802be3d426cd19b3a443cb02456248307183a1 100644
--- a/src/spider/BlogPageProcessor.java
+++ b/src/spider/BlogPageProcessor.java
@@ -54,9 +54,10 @@ public class BlogPageProcessor implements PageProcessor{
String spiderName=""; //切割域名 :类似:csdn.net, 51cto.com, cnblogs.com, iteye.com
- Pattern p=Pattern.compile("\\.([a-zA-Z0-9]+\\.[a-zA-Z]+)");
+ // Group 1 captures the whole dotted host name found in the URL, sub-domain labels included (e.g. mp.weixin.qq.com).
+ Pattern p=Pattern.compile("((?!://)([a-zA-Z0-9-_]+\\.)*[a-zA-Z0-9][a-zA-Z0-9-_]+\\.[a-zA-Z]{2,11})");
Matcher m=p.matcher(url);
-
+
if(m.find()){
spiderName = m.group(1);
} else {