diff --git a/WebContent/img/baijiahao_logo.png b/WebContent/img/baijiahao_logo.png new file mode 100644 index 0000000000000000000000000000000000000000..71dc9cc7e44b0638d167642c2b894b6263d47c7f Binary files /dev/null and b/WebContent/img/baijiahao_logo.png differ diff --git a/WebContent/img/bd_developer_logo.png b/WebContent/img/bd_developer_logo.png new file mode 100644 index 0000000000000000000000000000000000000000..2c32468a059eb833ed0a67fd104971120c29a721 Binary files /dev/null and b/WebContent/img/bd_developer_logo.png differ diff --git a/WebContent/img/wechat_logo.png b/WebContent/img/wechat_logo.png new file mode 100644 index 0000000000000000000000000000000000000000..892de473b30378d13e15daf342c7b401c886a1f7 Binary files /dev/null and b/WebContent/img/wechat_logo.png differ diff --git a/WebContent/index.html b/WebContent/index.html index 618c88f8de5b3a4e713d7444ef59fb5663860ee9..6cad920dfb03cdadb4831ce5f110051dc702da7f 100644 --- a/WebContent/index.html +++ b/WebContent/index.html @@ -53,6 +53,9 @@ src="img/wordpress-logo.jpg" />
  • +
  • +
  • +
  • diff --git a/src/common/Spider.xml b/src/common/Spider.xml index cf674b4c28f6d008388fc610e312a90f4da39941..ca98fcc4b6258de17c12c4de3896a139888ad3b4 100644 --- a/src/common/Spider.xml +++ b/src/common/Spider.xml @@ -188,7 +188,7 @@ - wordpress + wordpress ]]> @@ -214,7 +214,7 @@ ]]> - + @@ -230,4 +230,90 @@ cpp + + + + mp.weixin.qq.com + utf-8 + weixin.qq.com + /s/ + + + + + + + + + + + + + + + + + ]]> + ]]> + ]]> + + + + + + developer.baidu.com + utf-8 + baidu.com + /topic/show/ + + + + + + + + + + + + + + + + + ]]> + ]]> + ]]> + + + + + + baijiahao.baidu.com + utf-8 + baidu.com + /s + + + + + + + + + + + + + + + + + + + ]]> + ]]> + ]]> + + \ No newline at end of file diff --git a/src/common/SpiderConfigTool.java b/src/common/SpiderConfigTool.java index 99ce6985beb4e52c741eab0aa4718504822adc88..e6924629597a106ca4e68a2832cf19cd7be3dc57 100644 --- a/src/common/SpiderConfigTool.java +++ b/src/common/SpiderConfigTool.java @@ -15,7 +15,7 @@ public class SpiderConfigTool { private SAXReader reader; private Document doc; private Node spiderNode; - + public SpiderConfigTool(String spiderName) throws DocumentException{ file = new File(this.getClass().getResource("").getPath()+configPath); @@ -24,29 +24,30 @@ public class SpiderConfigTool { } reader = new SAXReader(); doc = reader.read(file); - spiderNode = getSpider(spiderName); - + spiderNode = getSpider(spiderName); + } - + public Node getSpiderNode(){ return spiderNode; } - + @SuppressWarnings("unchecked") private Node getSpider(String spiderName){ List list = doc.selectNodes("config/spider-cofig"); - + for(Node i : list){ - if(i.selectSingleNode("name").getText().equals(spiderName)){ + Node domain = i.selectSingleNode("domain"); + if (domain != null && domain.getText().equals(spiderName)) { return i; } } - + return null; } - + public Document getDoc(){ return doc; } - + } diff --git a/src/spider/BlogPageProcessor.java b/src/spider/BlogPageProcessor.java index eb783b68062538f1aab0147050f64b89cc219e98..99802be3d426cd19b3a443cb02456248307183a1 100644 --- a/src/spider/BlogPageProcessor.java +++ b/src/spider/BlogPageProcessor.java @@ -54,9 +54,10 @@ public class BlogPageProcessor implements PageProcessor{ String spiderName=""; //切割域名 :类似:csdn.net, 51cto.com, cnblogs.com, iteye.com - Pattern p=Pattern.compile("\\.([a-zA-Z0-9]+\\.[a-zA-Z]+)"); + //Pattern p=Pattern.compile("\\.([a-zA-Z0-9]+\\.[a-zA-Z]+)"); + Pattern p=Pattern.compile("((?!://)([a-zA-Z0-9-_]+\\.)*[a-zA-Z0-9][a-zA-Z0-9-_]+\\.[a-zA-Z]{2,11})"); Matcher m=p.matcher(url); - + if(m.find()){ spiderName = m.group(1); } else {