SGTY 5 місяців тому
коміт
f9ed703568
29 змінених файлів з 2932 додано та 0 видалено
  1. 33 0
      .gitignore
  2. 144 0
      pom.xml
  3. 13 0
      src/main/java/com/qizhen/spider/QizhenApplication.java
  4. 154 0
      src/main/java/com/qizhen/spider/ai/BaidubceUtil.java
  5. 118 0
      src/main/java/com/qizhen/spider/ai/BaidubceUtilTest.java
  6. 118 0
      src/main/java/com/qizhen/spider/ai/BaidubceUtilTest2.java
  7. 118 0
      src/main/java/com/qizhen/spider/ai/BaidubceUtilTest3.java
  8. 114 0
      src/main/java/com/qizhen/spider/ai/BaikeUtilTest.java
  9. 83 0
      src/main/java/com/qizhen/spider/ai/Knowlege.java
  10. 370 0
      src/main/java/com/qizhen/spider/ai/QizhenAssistant.java
  11. 67 0
      src/main/java/com/qizhen/spider/demos/web/BasicController.java
  12. 44 0
      src/main/java/com/qizhen/spider/demos/web/PathVariableController.java
  13. 43 0
      src/main/java/com/qizhen/spider/demos/web/User.java
  14. 196 0
      src/main/java/com/qizhen/spider/excel/ExcelProcess.java
  15. 65 0
      src/main/java/com/qizhen/spider/hyw/AlexanderMcqueenGoodsProcessor.java
  16. 19 0
      src/main/java/com/qizhen/spider/hyw/Doctor.java
  17. 37 0
      src/main/java/com/qizhen/spider/hyw/GithubRepo.java
  18. 40 0
      src/main/java/com/qizhen/spider/hyw/GithubRepoPageProcessor.java
  19. 62 0
      src/main/java/com/qizhen/spider/hyw/ZhihuPageProcessor.java
  20. 102 0
      src/main/java/com/qizhen/spider/hyw/hbrmHospital/HubeiMainDoctorNewPageProcessor.java
  21. 45 0
      src/main/java/com/qizhen/spider/hyw/tongjihospital/DepartmentPageProcessor.java
  22. 233 0
      src/main/java/com/qizhen/spider/hyw/tongjihospital/DoctorDataProcessor.java
  23. 37 0
      src/main/java/com/qizhen/spider/hyw/wenzhang/WenzhangPageProcessor.java
  24. 56 0
      src/main/java/com/qizhen/spider/hyw/whdxznhospital/DepartmentPageProcessor.java
  25. 129 0
      src/main/java/com/qizhen/spider/hyw/whdxznhospital/DoctorPageProcessor.java
  26. 81 0
      src/main/java/com/qizhen/spider/pipeline/ExcelPipeline.java
  27. 3 0
      src/main/resources/application.properties
  28. 395 0
      src/main/resources/static/index.html
  29. 13 0
      src/test/java/com/qizhen/spider/QizhenApplicationTests.java

+ 33 - 0
.gitignore

@@ -0,0 +1,33 @@
+HELP.md
+target/
+!.mvn/wrapper/maven-wrapper.jar
+!**/src/main/**/target/
+!**/src/test/**/target/
+
+### STS ###
+.apt_generated
+.classpath
+.factorypath
+.project
+.settings
+.springBeans
+.sts4-cache
+
+### IntelliJ IDEA ###
+.idea
+*.iws
+*.iml
+*.ipr
+
+### NetBeans ###
+/nbproject/private/
+/nbbuild/
+/dist/
+/nbdist/
+/.nb-gradle/
+build/
+!**/src/main/**/build/
+!**/src/test/**/build/
+
+### VS Code ###
+.vscode/

+ 144 - 0
pom.xml

@@ -0,0 +1,144 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <groupId>com.qizhen</groupId>
+    <artifactId>healsphere</artifactId>
+    <version>0.0.1-SNAPSHOT</version>
+    <name>qizhen</name>
+    <description>qizhen</description>
+    <properties>
+        <java.version>1.8</java.version>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
+        <spring-boot.version>2.6.13</spring-boot.version>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>org.springframework.boot</groupId>
+            <artifactId>spring-boot-starter-web</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.springframework.boot</groupId>
+            <artifactId>spring-boot-starter-test</artifactId>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.poi</groupId>
+            <artifactId>poi</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.poi</groupId>
+            <artifactId>poi-ooxml</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>cn.hutool</groupId>
+            <artifactId>hutool-all</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>us.codecraft</groupId>
+            <artifactId>webmagic-core</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>us.codecraft</groupId>
+            <artifactId>webmagic-extension</artifactId>
+        </dependency>
+        <dependency>
+            <artifactId>fastjson</artifactId>
+            <groupId>com.alibaba</groupId>
+        </dependency>
+        <dependency>
+            <groupId>org.projectlombok</groupId>
+            <artifactId>lombok</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>com.alibaba</groupId>
+            <artifactId>easyexcel</artifactId>
+        </dependency>
+    </dependencies>
+    <dependencyManagement>
+        <dependencies>
+            <dependency>
+                <groupId>org.springframework.boot</groupId>
+                <artifactId>spring-boot-dependencies</artifactId>
+                <version>${spring-boot.version}</version>
+                <type>pom</type>
+                <scope>import</scope>
+            </dependency>
+            <dependency>
+                <groupId>org.apache.poi</groupId>
+                <artifactId>poi</artifactId>
+                <version>5.2.3</version>
+            </dependency>
+            <dependency>
+                <groupId>org.apache.poi</groupId>
+                <artifactId>poi-ooxml</artifactId>
+                <version>5.2.3</version>
+            </dependency>
+            <dependency>
+                <groupId>cn.hutool</groupId>
+                <artifactId>hutool-all</artifactId>
+                <version>5.4.0</version>
+            </dependency>
+            <dependency>
+                <groupId>us.codecraft</groupId>
+                <artifactId>webmagic-core</artifactId>
+                <version>0.9.0</version>
+            </dependency>
+            <dependency>
+                <groupId>us.codecraft</groupId>
+                <artifactId>webmagic-extension</artifactId>
+                <version>0.9.0</version>
+            </dependency>
+            <dependency>
+                <artifactId>fastjson</artifactId>
+                <groupId>com.alibaba</groupId>
+                <version>1.2.83</version>
+            </dependency>
+            <dependency>
+                <groupId>org.projectlombok</groupId>
+                <artifactId>lombok</artifactId>
+                <version>1.18.24</version>
+                <scope>provided</scope>
+            </dependency>
+            <dependency>
+                <groupId>com.alibaba</groupId>
+                <artifactId>easyexcel</artifactId>
+                <version>4.0.3</version>
+            </dependency>
+        </dependencies>
+    </dependencyManagement>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>3.8.1</version>
+                <configuration>
+                    <source>1.8</source>
+                    <target>1.8</target>
+                    <encoding>UTF-8</encoding>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.springframework.boot</groupId>
+                <artifactId>spring-boot-maven-plugin</artifactId>
+                <version>${spring-boot.version}</version>
+                <configuration>
+                    <mainClass>com.qizhen.spider.QizhenApplication</mainClass>
+                    <!--
+                      The scaffold's "skip=true" flag was removed: it disabled the
+                      repackage execution declared below, so "mvn package" produced a plain
+                      jar that cannot be started with "java -jar".
+                    -->
+                </configuration>
+                <executions>
+                    <execution>
+                        <id>repackage</id>
+                        <goals>
+                            <goal>repackage</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+
+</project>

+ 13 - 0
src/main/java/com/qizhen/spider/QizhenApplication.java

@@ -0,0 +1,13 @@
+package com.qizhen.spider;
+
+import org.springframework.boot.SpringApplication;
+import org.springframework.boot.autoconfigure.SpringBootApplication;
+
+/**
+ * Spring Boot entry point of the qizhen spider application.
+ */
+@SpringBootApplication
+public class QizhenApplication {
+
+    /**
+     * Starts the Spring application using this class as the primary source.
+     *
+     * @param args command-line arguments, handed to Spring Boot unchanged
+     */
+    public static void main(String[] args) {
+        final Class<?> primarySource = QizhenApplication.class;
+        SpringApplication.run(primarySource, args);
+    }
+}

Різницю між файлами не показано, бо вона завелика
+ 154 - 0
src/main/java/com/qizhen/spider/ai/BaidubceUtil.java


Різницю між файлами не показано, бо вона завелика
+ 118 - 0
src/main/java/com/qizhen/spider/ai/BaidubceUtilTest.java


Різницю між файлами не показано, бо вона завелика
+ 118 - 0
src/main/java/com/qizhen/spider/ai/BaidubceUtilTest2.java


Різницю між файлами не показано, бо вона завелика
+ 118 - 0
src/main/java/com/qizhen/spider/ai/BaidubceUtilTest3.java


Різницю між файлами не показано, бо вона завелика
+ 114 - 0
src/main/java/com/qizhen/spider/ai/BaikeUtilTest.java


+ 83 - 0
src/main/java/com/qizhen/spider/ai/Knowlege.java

@@ -0,0 +1,83 @@
+package com.qizhen.spider.ai;
+
+import com.alibaba.excel.annotation.ExcelIgnore;
+import com.alibaba.excel.annotation.ExcelProperty;
+import lombok.AllArgsConstructor;
+import lombok.Getter;
+import lombok.NoArgsConstructor;
+import lombok.Setter;
+
+/**
+ * One spreadsheet row of extracted knowledge. Each field is bound to a column
+ * through its {@link ExcelProperty} header; accessors and constructors are
+ * generated by Lombok.
+ */
+@Getter
+@Setter
+@NoArgsConstructor
+@AllArgsConstructor
+public class Knowlege {
+    /** Column "实体" (entity). */
+    @ExcelProperty("实体")
+    private String entity;
+    /** Column "属性" (property). */
+    @ExcelProperty("属性")
+    private String property;
+    /** Column "属性值" (property value). */
+    @ExcelProperty("属性值")
+    private String value;
+    /** Column "原文" (original text). */
+    @ExcelProperty("原文")
+    private String text;
+    /** Column "icd10". */
+    @ExcelProperty("icd10")
+    private String icd10;
+
+    /**
+     * Scratch driver: prints a sample fenced-JSON answer with the surrounding
+     * Markdown code fence removed.
+     *
+     * @param args unused
+     */
+    public static void main(String[] args) {
+        String fenced = "```json\n" +
+                "[\n" +
+                "  {\n" +
+                "\t\"name\": \"沙丁胺醇\",\n" +
+                "\t\"description\": \"短效β2受体激动剂\"\n" +
+                "  },\n" +
+                "  {\n" +
+                "\t\"name\": \"特布他林\",\n" +
+                "\t\"description\": \"短效β2受体激动剂\"\n" +
+                "  },\n" +
+                "  {\n" +
+                "\t\"name\": \"布地奈德\",\n" +
+                "\t\"description\": \"吸入型糖皮质激素\"\n" +
+                "  },\n" +
+                "  {\n" +
+                "\t\"name\": \"异丙托溴铵\",\n" +
+                "\t\"description\": \"抗胆碱药\"\n" +
+                "  },\n" +
+                "  {\n" +
+                "\t\"name\": \"福莫特罗\",\n" +
+                "\t\"description\": \"长效β2受体激动剂\"\n" +
+                "  },\n" +
+                "  {\n" +
+                "\t\"name\": \"茚达特罗\",\n" +
+                "\t\"description\": \"长效β2受体激动剂\"\n" +
+                "  },\n" +
+                "  {\n" +
+                "\t\"name\": \"噻托溴铵\",\n" +
+                "\t\"description\": \"抗胆碱药\"\n" +
+                "  },\n" +
+                "  {\n" +
+                "\t\"name\": \"泼尼松龙\",\n" +
+                "\t\"description\": \"全身性糖皮质激素\"\n" +
+                "  },\n" +
+                "  {\n" +
+                "\t\"name\": \"孟鲁司特\",\n" +
+                "\t\"description\": \"白三烯调节剂\"\n" +
+                "  },\n" +
+                "  {\n" +
+                "\t\"name\": \"抗IgE抗体\",\n" +
+                "\t\"description\": \"生物制剂\"\n" +
+                "  },\n" +
+                "  {\n" +
+                "\t\"name\": \"抗IL-5抗体\",\n" +
+                "\t\"description\": \"生物制剂\"\n" +
+                "  }\n" +
+                "]\n" +
+                "```";
+        System.out.println(stripJsonFence(fenced));
+    }
+
+    /**
+     * Drops a leading {@code ```json} marker and a trailing {@code ```} marker
+     * when present; any other text is returned untouched.
+     */
+    private static String stripJsonFence(String raw) {
+        // 7 == length of the opening marker "```json"
+        String body = raw.startsWith("```json") ? raw.substring(7) : raw;
+        // 3 == length of the closing marker "```"
+        return body.endsWith("```") ? body.substring(0, body.length() - 3) : body;
+    }
+}

Різницю між файлами не показано, бо вона завелика
+ 370 - 0
src/main/java/com/qizhen/spider/ai/QizhenAssistant.java


+ 67 - 0
src/main/java/com/qizhen/spider/demos/web/BasicController.java

@@ -0,0 +1,67 @@
+/*
+ * Copyright 2013-2018 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.qizhen.spider.demos.web;
+
+import org.springframework.stereotype.Controller;
+import org.springframework.web.bind.annotation.ModelAttribute;
+import org.springframework.web.bind.annotation.RequestMapping;
+import org.springframework.web.bind.annotation.RequestParam;
+import org.springframework.web.bind.annotation.ResponseBody;
+
+/**
+ * Demo Spring MVC controller exposing a few sample endpoints.
+ *
+ * @author <a href="mailto:chenxilzx1@gmail.com">theonefx</a>
+ */
+@Controller
+public class BasicController {
+
+    /**
+     * Model-attribute hook declared on this controller: it overwrites the
+     * supplied {@link User} with fixed demo values. The {@code name} and
+     * {@code age} request parameters are accepted but not used.
+     */
+    @ModelAttribute
+    public void parseUser(@RequestParam(name = "name", defaultValue = "unknown user") String name
+            , @RequestParam(name = "age", defaultValue = "12") Integer age, User user) {
+        user.setAge(18);
+        user.setName("zhangsan");
+    }
+
+    /**
+     * Greets the caller, e.g. {@code http://127.0.0.1:8080/hello?name=lisi}.
+     *
+     * @param name value of the {@code name} parameter, "unknown user" when absent
+     * @return the greeting text written to the response body
+     */
+    @RequestMapping("/hello")
+    @ResponseBody
+    public String hello(@RequestParam(name = "name", defaultValue = "unknown user") String name) {
+        return String.format("Hello %s", name);
+    }
+
+    /**
+     * Returns a fixed sample user, e.g. {@code http://127.0.0.1:8080/user}.
+     *
+     * @return a new {@link User} named "theonefx" aged 666
+     */
+    @RequestMapping("/user")
+    @ResponseBody
+    public User user() {
+        User sample = new User();
+        sample.setAge(666);
+        sample.setName("theonefx");
+        return sample;
+    }
+
+    /**
+     * Echoes the bound user, e.g. {@code http://127.0.0.1:8080/save_user?name=newName&age=11}.
+     *
+     * @param u user object supplied by the framework
+     * @return a message containing the user's name and age
+     */
+    @RequestMapping("/save_user")
+    @ResponseBody
+    public String saveUser(User u) {
+        return String.format("user will save: name=%s, age=%s", u.getName(), u.getAge());
+    }
+
+    /**
+     * Handler for {@code http://127.0.0.1:8080/html}; not annotated with
+     * {@code @ResponseBody}, so the returned string is a view name.
+     *
+     * @return the literal {@code "index.html"}
+     */
+    @RequestMapping("/html")
+    public String html() {
+        return "index.html";
+    }
+}

+ 44 - 0
src/main/java/com/qizhen/spider/demos/web/PathVariableController.java

@@ -0,0 +1,44 @@
+/*
+ * Copyright 2013-2018 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.qizhen.spider.demos.web;
+
+import org.springframework.stereotype.Controller;
+import org.springframework.web.bind.annotation.PathVariable;
+import org.springframework.web.bind.annotation.RequestMapping;
+import org.springframework.web.bind.annotation.RequestMethod;
+import org.springframework.web.bind.annotation.ResponseBody;
+
+/**
+ * Demo controller showing plain and regex-constrained path variables.
+ *
+ * @author <a href="mailto:chenxilzx1@gmail.com">theonefx</a>
+ */
+@Controller
+public class PathVariableController {
+
+    /**
+     * Echoes both path segments, e.g. {@code http://127.0.0.1:8080/user/123/roles/222}.
+     *
+     * @param userId value of the {@code userId} path segment
+     * @param roleId value of the {@code roleId} path segment
+     * @return a text line containing both ids
+     */
+    @RequestMapping(value = "/user/{userId}/roles/{roleId}", method = RequestMethod.GET)
+    @ResponseBody
+    public String getLogin(@PathVariable("userId") String userId, @PathVariable("roleId") String roleId) {
+        return String.format("User Id : %s Role Id : %s", userId, roleId);
+    }
+
+    /**
+     * Echoes a path segment restricted to lower-case letters and hyphens,
+     * e.g. {@code http://127.0.0.1:8080/javabeat/somewords}.
+     *
+     * @param regexp1 the matched path segment
+     * @return a text line containing the segment
+     */
+    @RequestMapping(value = "/javabeat/{regexp1:[a-z-]+}", method = RequestMethod.GET)
+    @ResponseBody
+    public String getRegExp(@PathVariable("regexp1") String regexp1) {
+        return String.format("URI Part : %s", regexp1);
+    }
+}

+ 43 - 0
src/main/java/com/qizhen/spider/demos/web/User.java

@@ -0,0 +1,43 @@
+/*
+ * Copyright 2013-2018 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.qizhen.spider.demos.web;
+
+/**
+ * Mutable bean with a name and an age, used by the demo controllers.
+ *
+ * @author <a href="mailto:chenxilzx1@gmail.com">theonefx</a>
+ */
+public class User {
+
+    private String name;
+    private Integer age;
+
+    /** @return the current name, {@code null} until one is set */
+    public String getName() {
+        return this.name;
+    }
+
+    /** @return the current age, {@code null} until one is set */
+    public Integer getAge() {
+        return this.age;
+    }
+
+    /** @param name the new name */
+    public void setName(String name) {
+        this.name = name;
+    }
+
+    /** @param age the new age */
+    public void setAge(Integer age) {
+        this.age = age;
+    }
+}

Різницю між файлами не показано, бо вона завелика
+ 196 - 0
src/main/java/com/qizhen/spider/excel/ExcelProcess.java


+ 65 - 0
src/main/java/com/qizhen/spider/hyw/AlexanderMcqueenGoodsProcessor.java

@@ -0,0 +1,65 @@
+package com.qizhen.spider.hyw;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+import us.codecraft.webmagic.scheduler.PriorityScheduler;
+
+/**
+ * WebMagic processor that walks alexandermcqueen.cn and extracts product
+ * fields from pages whose URL matches {@link #URL_POST}.
+ *
+ * @author code4crafer@gmail.com
+ */
+public class AlexanderMcqueenGoodsProcessor implements PageProcessor {
+
+    private Site site = Site.me().setRetryTimes(3).setSleepTime(0);
+
+    /** Regex for any page on the site; matching links are queued for further crawling. */
+    public static final String URL_LIST = "http://www\\.alexandermcqueen\\.cn/.*";
+
+    /** Regex for product detail pages. */
+    public static final String URL_POST = "http://www\\.alexandermcqueen\\.cn/cn/\\w+/.*\\.html";
+
+    /**
+     * Extracts product fields from a detail page, or queues outgoing links
+     * when the page is not a detail page.
+     *
+     * @param page the downloaded page handed in by the spider
+     */
+    @Override
+    public void process(Page page) {
+        if (!page.getUrl().regex(URL_POST).match()) {
+            // Not a product page: queue product links with priority 1000 and all other site links with priority 1.
+            page.addTargetRequests(page.getHtml().links().regex(URL_POST).all(), 1000);
+            page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all(), 1);
+            return;
+        }
+
+        // Fully qualified on purpose: this file does not import Selectable.
+        us.codecraft.webmagic.selector.Selectable goodsName =
+                page.getHtml().xpath("//div[@id='description']/h1/tidyText()");
+        page.putField("goodsName", goodsName);
+        // The stored Selectable is never null itself, so the old null check on the
+        // result item could never fire; test the selected text instead.
+        if (goodsName.get() == null) {
+            page.setSkip(true);
+        }
+        page.putField("currency", page.getHtml().xpath("//div[@id='description']//div[@class='itemBoxPrice']/span//span[@class='currency']/tidyText()"));
+        page.putField("goodsPrice", page.getHtml().xpath("//div[@id='description']//div[@class='itemBoxPrice']/span//span[@class='priceValue']/tidyText()"));
+        page.putField("description", page.getHtml()
+                .xpath("//div[@id='tabbedDescription']//div[@class='tabbedDescription']//ul[@id='tabs']//li[@id='tab_description']/div[@id='description_pane']/tidyText()"));
+        page.putField("material", page.getHtml()
+                .xpath("//div[@id='tabbedDescription']" +
+                        "//div[@class='tabbedDescription']" +
+                        "//ul[@id='tabs']" +
+                        "//li[@id='tab_description']" +
+                        "//div[@class='productProperty']" +
+                        "//div[@class='productPropertyRow']/span[2]/tidyText()"));
+        page.putField("goodsCode", page.getHtml()
+                .xpath("//div[@id='tabbedDescription']" +
+                        "//div[@class='tabbedDescription']" +
+                        "//ul[@id='tabs']" +
+                        "//li[@id='tab_description']" +
+                        "//div[@class='productProperty']" +
+                        "//div[@class='productPropertyRow']//span[@id='modelFabricColorContainer']/tidyText()"));
+        page.putField("goodsSize", page.getHtml()
+                .xpath("//div[@id='sizesContainer']//div[@id='sizes']//ul[@class='SizeW']"));
+        page.putField("goodsColors", page.getHtml()
+                .xpath("//div[@id='colors']/ul/html()"));
+    }
+
+    /** @return the site settings (3 retries, no sleep between requests) */
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    /**
+     * Starts a five-thread crawl from the site map using a priority scheduler.
+     *
+     * @param args unused
+     */
+    public static void main(String[] args) {
+        Spider.create(new AlexanderMcqueenGoodsProcessor()).setScheduler(new PriorityScheduler())
+                .addUrl("http://www.alexandermcqueen.cn/sitemap.asp?tskay=E2F1A848").thread(5).run();
+    }
+}

+ 19 - 0
src/main/java/com/qizhen/spider/hyw/Doctor.java

@@ -0,0 +1,19 @@
+package com.qizhen.spider.hyw;
+
+import lombok.AllArgsConstructor;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+/**
+ * Plain data holder for one doctor record. Accessors, {@code equals},
+ * {@code hashCode}, {@code toString} and both constructors are generated by
+ * Lombok. Field order is significant: it is the parameter order of the
+ * all-args constructor.
+ */
+@Data
+@NoArgsConstructor
+@AllArgsConstructor
+public class Doctor {
+    // Identity
+    private String doctor_id;
+    private String doctor_name;
+    // Profile texts
+    private String title;
+    private String des;
+    private String good_at;
+    private String header_pic;
+    private String work_time;
+    // Owning department
+    private String dept_id;
+}

+ 37 - 0
src/main/java/com/qizhen/spider/hyw/GithubRepo.java

@@ -0,0 +1,37 @@
+package com.qizhen.spider.hyw;
+
+/**
+ * Mutable bean holding the name, author and readme text of a GitHub repository.
+ *
+ * @author code4crafer@gmail.com
+ */
+public class GithubRepo {
+
+    private String name;
+    private String author;
+    private String readme;
+
+    /** @return the repository name, {@code null} until set */
+    public String getName() {
+        return this.name;
+    }
+
+    /** @return the repository author, {@code null} until set */
+    public String getAuthor() {
+        return this.author;
+    }
+
+    /** @return the readme text, {@code null} until set */
+    public String getReadme() {
+        return this.readme;
+    }
+
+    /** @param name the repository name */
+    public void setName(String name) {
+        this.name = name;
+    }
+
+    /** @param author the repository author */
+    public void setAuthor(String author) {
+        this.author = author;
+    }
+
+    /** @param readme the readme text */
+    public void setReadme(String readme) {
+        this.readme = readme;
+    }
+}

+ 40 - 0
src/main/java/com/qizhen/spider/hyw/GithubRepoPageProcessor.java

@@ -0,0 +1,40 @@
+package com.qizhen.spider.hyw;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+/**
+ * WebMagic processor that follows GitHub user and repository links and
+ * emits a {@link GithubRepo} for every page that carries a repository title.
+ *
+ * @author code4crafter@gmail.com <br>
+ * @since 0.5.1
+ */
+public class GithubRepoPageProcessor implements PageProcessor {
+
+    private Site site = Site.me().setRetryTimes(3).setSleepTime(0);
+
+    /**
+     * Queues repository and user links, then publishes the repository under
+     * the {@code "repo"} field, or skips the page when no name was found.
+     *
+     * @param page the downloaded page handed in by the spider
+     */
+    @Override
+    public void process(Page page) {
+        // Crawl frontier: repository links first, then user/organisation links.
+        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all());
+        page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all());
+
+        String repoName = page.getHtml()
+                .xpath("//h1[contains(@class, 'entry-title') and contains(@class, 'public')]/strong/a/text()")
+                .toString();
+        if (repoName == null) {
+            // No repository title on this page: nothing to emit.
+            page.setSkip(true);
+            return;
+        }
+
+        GithubRepo repo = new GithubRepo();
+        repo.setName(repoName);
+        repo.setAuthor(page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
+        repo.setReadme(page.getHtml().xpath("//div[@id='readme']/tidyText()").toString());
+        page.putField("repo", repo);
+    }
+
+    /** @return the site settings (3 retries, no sleep between requests) */
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    /**
+     * Starts a five-thread crawl from the code4craft profile page.
+     *
+     * @param args unused
+     */
+    public static void main(String[] args) {
+        Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
+    }
+}

+ 62 - 0
src/main/java/com/qizhen/spider/hyw/ZhihuPageProcessor.java

@@ -0,0 +1,62 @@
+package com.qizhen.spider.hyw;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.pipeline.FilePipeline;
+import us.codecraft.webmagic.processor.PageProcessor;
+import us.codecraft.webmagic.selector.Html;
+
+import java.util.List;
+
+/**
+ * WebMagic processor for Zhihu question pages: follows search-result and
+ * related-question links and keeps answers whose vote count reaches
+ * {@link #VOTE_THRESHOLD}.
+ *
+ * @author 410775541@qq.com <br>
+ * @since 0.5.1
+ */
+public class ZhihuPageProcessor implements PageProcessor {
+
+    private Site site = Site.me().setCycleRetryTimes(5).setRetryTimes(5).setSleepTime(500).setTimeOut(3 * 60 * 1000)
+            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0")
+            .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
+            .addHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
+            .setCharset("UTF-8");
+
+    /** Minimum vote count an answer needs to be kept. */
+    private static final int VOTE_THRESHOLD = 1000;
+
+    /**
+     * Queues follow-up links and stores the vote, content and author link of
+     * a qualifying answer. When several answers qualify, later ones overwrite
+     * the fields of earlier ones. The page is skipped when no answer qualifies.
+     *
+     * @param page the downloaded page handed in by the spider
+     */
+    @Override
+    public void process(Page page) {
+        page.addTargetRequests(page.getHtml().xpath("//li[@class='item clearfix']/div/a/@href").all());
+        page.addTargetRequests(page.getHtml()
+                .xpath("//div[@id='zh-question-related-questions']//a[@class='question_link']/@href").all());
+
+        List<String> answers = page.getHtml().xpath("//div[@id='zh-question-answer-wrap']/div").all();
+        boolean exist = false;
+        for (String answer : answers) {
+            // Parse the answer fragment once and reuse it for every selector.
+            Html answerHtml = new Html(answer);
+            String vote = answerHtml.xpath("//div[@class='zm-votebar']//span[@class='count']/text()").toString();
+            if (parseVote(vote) >= VOTE_THRESHOLD) {
+                page.putField("vote", vote);
+                page.putField("content", answerHtml.xpath("//div[@class='zm-editable-content']"));
+                page.putField("userid", answerHtml.xpath("//a[@class='author-link']/@href"));
+                exist = true;
+            }
+        }
+        if (!exist) {
+            page.setSkip(true);
+        }
+    }
+
+    /**
+     * Converts the scraped vote text to a number.
+     *
+     * @param vote text of the vote counter, may be {@code null}
+     * @return the parsed count, or {@code -1} when the text is missing or not
+     *         a plain integer, so such answers never reach the threshold
+     */
+    private static int parseVote(String vote) {
+        if (vote == null) {
+            return -1;
+        }
+        try {
+            return Integer.parseInt(vote.trim());
+        } catch (NumberFormatException ignored) {
+            // A missing or non-numeric counter must not abort processing of the whole page.
+            return -1;
+        }
+    }
+
+    /** @return the site settings (retries, sleep time, timeout, headers, charset) */
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    /**
+     * Starts a five-thread crawl from a Zhihu question search and writes
+     * results below {@code D:\webmagic\}.
+     *
+     * @param args unused
+     */
+    public static void main(String[] args) {
+        Spider.create(new ZhihuPageProcessor()).
+                addUrl("https://www.zhihu.com/search?type=question&q=java").
+                addPipeline(new FilePipeline("D:\\webmagic\\")).
+                thread(5).
+                run();
+    }
+}

+ 102 - 0
src/main/java/com/qizhen/spider/hyw/hbrmHospital/HubeiMainDoctorNewPageProcessor.java

@@ -0,0 +1,102 @@
+package com.qizhen.spider.hyw.hbrmHospital;
+
+import com.qizhen.spider.pipeline.ExcelPipeline;
+import org.apache.commons.lang3.StringUtils;
+import org.springframework.util.CollectionUtils;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+import us.codecraft.webmagic.selector.Selectable;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * @author 410775541@qq.com <br>
+ * @since 0.5.1
+ */
+public class HubeiMainDoctorNewPageProcessor implements PageProcessor {
+
+    private Site site = Site.me().setCycleRetryTimes(5).setRetryTimes(5).setSleepTime(500).setTimeOut(3 * 60 * 1000)
+            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36")
+            .addHeader("Accept", "*/*")
+            .addHeader("Accept-Language", "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7")
+            .setCharset("UTF-8");
+    String[] wordTimeArray = { "周一上午","周一下午", "周二上午","周二下午", "周三上午","周三下午", "周四上午","周四下午", "周五上午","周五下午", "周六上午","周六下午","周日上午","周日下午"};
+
+    @Override
+    public void process(Page page) {
+//        List<String> all = page.getHtml().xpath("//div[@class='Contentboxnav']//a/@href").all();
+//        Set<String> set = new HashSet<>(all);
+//        System.out.println(all);
+//        page.addTargetRequests(new ArrayList<>(set));
+        List<String> relativeUrl = page.getHtml().xpath("//div[@class='moredr']//a/@href").all();
+        page.addTargetRequests(relativeUrl);
+
+        List<String> doctorUrl = page.getHtml().xpath("//div[@class='dr-content']//div[@class='dr_list']//a/@href").all();
+        page.addTargetRequests(doctorUrl);
+
+        String doctorName =  page.getHtml().xpath("//div[@class='drout']//div[@class='dr_title']/span/text()").get();
+        if (StringUtils.isNotEmpty(doctorName)) {
+            page.putField("doctor_name",doctorName);
+            page.putField("header_pic",page.getHtml().xpath("//div[@class='drout']//div[@class='dr_img']/img/@src").get());
+            page.putField("position", page.getHtml().xpath("//div[@class='drout']//div[@class='drinoright pull-right']/div[3]//p/tidyText()"));
+
+            String desc1 = page.getHtml().xpath("//div[@class='drout']//div[@class='drinoright pull-right']/div[5]//tidyText()").get();
+
+            String desc2 = page.getHtml().xpath("//div[@class='drout']//div[@class='drinoright pull-right']/div[6]//tidyText()").get();
+
+            String desc = "";
+            if (StringUtils.isNotEmpty(desc1) && StringUtils.isNotEmpty(desc2)) {
+                desc = desc1 + "\n" + desc2;
+            }else if (StringUtils.isNotEmpty(desc1)) {
+                desc = desc1;
+            }else if (StringUtils.isNotEmpty(desc2)) {
+                desc = desc2;
+            }
+
+            page.putField("des",desc);
+            page.putField("good_at",page.getHtml().xpath("//div[@class='drout']//div[@class='drinoright pull-right']/div[4]//tidyText()").replace("诊疗专长","").replace("\n",""));
+            page.putField("dept_name",page.getHtml().xpath("//div[@class='drout']//div[@class='dr_title']/i[1]/a/text()"));
+            page.putField("title",page.getHtml().xpath("//div[@class='drout']//div[@class='dr_title']/i[2]/text()"));
+
+            List<Selectable> nodes = page.getHtml().xpath("/html/body/div[1]/div[2]/table/tbody/tr/").nodes();
+            String wordTime= "";
+            if(!CollectionUtils.isEmpty(nodes)) {
+                for (int i = 0; i < nodes.size(); i++) {
+                    if(i==0){
+                        continue;
+                    }
+                    //排版td节点下还有节点就是要坐诊
+                    String string = nodes.get(i).toString();
+                    if (string.contains("<i")) {
+                        wordTime+=wordTimeArray[i-1]+"、";
+                    }
+                }
+            }
+            if(StringUtils.isNotEmpty(wordTime)) {
+                page.putField("word_time", wordTime.substring(0, wordTime.length() - 1));
+            }
+        }else {
+            page.setSkip(true);
+        }
+
+    }
+
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    public static void main(String[] args) {
+        List<String> titles = Arrays.asList("doctor_name","header_pic","title","position","des","good_at","dept_name","word_time");
+        Spider.create(new HubeiMainDoctorNewPageProcessor()).
+                addUrl("https://www.rmhospital.com/dept/yxw/doctorlist.html").
+//                addPipeline(new FilePipeline("D:\\webmagic\\")).
+//                addPipeline(new ConsolePipeline()).
+                addPipeline(new ExcelPipeline("D:\\webmagic\\湖北省人民医院主院区","湖北省人民医院主院区医生列表",titles)).
+                thread(1).
+                run();
+    }
+}

+ 45 - 0
src/main/java/com/qizhen/spider/hyw/tongjihospital/DepartmentPageProcessor.java

@@ -0,0 +1,45 @@
+package com.qizhen.spider.hyw.tongjihospital;
+
+import com.qizhen.spider.pipeline.ExcelPipeline;
+import org.jsoup.internal.StringUtil;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Department crawler for Tongji Hospital, affiliated with Tongji Medical College of
+ * Huazhong University of Science and Technology.
+ */
+public class DepartmentPageProcessor implements PageProcessor {
+
+    /** Prefix prepended to the image {@code src} extracted from a department page. */
+    private static final String IMAGE_BASE_URL = "https://www.tjh.com.cn/";
+
+    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);
+
+    /**
+     * Queues every {@code /channels/<digits>.html} link found on the page and, when the
+     * page has a non-blank description block, stores the department fields
+     * ({@code id}, {@code dept_name}, {@code des}, {@code rec_des}, {@code dept_img}).
+     * Pages without a description are marked as skipped.
+     *
+     * @param page the downloaded page to extract from
+     */
+    @Override
+    public void process(Page page) {
+        page.addTargetRequests(page.getHtml().links().regex("http://www\\.tjh\\.com\\.cn/channels/\\d+\\.html").all());
+
+        // A page without a description is not treated as a department page: skip it
+        // before doing any further extraction work.
+        String des = page.getHtml().xpath("//div[@id='cstm_scroll1']/tidyText()").toString();
+        if (StringUtil.isBlank(des)) {
+            page.setSkip(true);
+            return;
+        }
+
+        page.putField("id", page.getUrl().toString());
+        page.putField("dept_name", page.getHtml().xpath("/html/body/div[4]/div[1]/div/h3/text()").toString());
+        page.putField("des", des);
+        page.putField("rec_des", page.getHtml().xpath("//div[@id='cstm_scroll2']/tidyText()").toString());
+
+        // Only store an image URL when an <img> was actually found; concatenating a
+        // missing src would otherwise produce the bogus value "https://www.tjh.com.cn/null".
+        String imgSrc = page.getHtml().xpath("/html/body/div[4]/div[2]/div/img/@src").toString();
+        if (!StringUtil.isBlank(imgSrc)) {
+            page.putField("dept_img", IMAGE_BASE_URL + imgSrc);
+        }
+    }
+
+    /**
+     * @return the crawl configuration (3 retries, 1000 ms sleep, 10000 ms timeout)
+     */
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    /**
+     * Command-line entry point: crawls from the department navigation page with five
+     * threads and sends the results to an {@link ExcelPipeline}.
+     *
+     * @param args command-line arguments (unused)
+     * @throws Exception declared for compatibility with the original signature
+     */
+    public static void main(String[] args) throws Exception {
+        List<String> titles = Arrays.asList("id","dept_name","rec_des","des","dept_img");
+        Spider.create(new DepartmentPageProcessor()).addUrl("http://www.tjh.com.cn/DeptNav/Index.html")
+                .addPipeline(new ExcelPipeline("C:\\Users\\17664\\Desktop\\hospitaldata","科室-华中科技大学同济医学院附属同济医院",titles)).thread(5).run();
+    }
+}

Різницю між файлами не показано, бо вона завелика
+ 233 - 0
src/main/java/com/qizhen/spider/hyw/tongjihospital/DoctorDataProcessor.java


+ 37 - 0
src/main/java/com/qizhen/spider/hyw/wenzhang/WenzhangPageProcessor.java

@@ -0,0 +1,37 @@
+package com.qizhen.spider.hyw.wenzhang;
+
+import com.qizhen.spider.pipeline.ExcelPipeline;
+import org.jsoup.internal.StringUtil;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Scratch processor that prints the body text of a single article page.
+ */
+public class WenzhangPageProcessor implements PageProcessor {
+
+    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);
+
+    /**
+     * Prints the tidied text of the {@code article-content} div to standard output.
+     * No result fields are stored on the page.
+     *
+     * @param page the downloaded article page
+     */
+    @Override
+    public void process(Page page) {
+        System.out.println(page.getHtml().xpath("//div[@class='article-content']/tidyText()").toString());
+    }
+
+    /**
+     * @return the crawl configuration held in {@code site}
+     */
+    @Override
+    public Site getSite() {
+        return this.site;
+    }
+
+    /**
+     * Command-line entry point: fetches one article URL with five threads and an
+     * {@link ExcelPipeline} attached.
+     *
+     * @param args command-line arguments (unused)
+     * @throws Exception declared by the original signature
+     */
+    public static void main(String[] args) throws Exception {
+        List<String> columns = Arrays.asList("id","dept_name","des","dept_img");
+
+        Spider crawler = Spider.create(new WenzhangPageProcessor());
+        crawler.addUrl("http://www.app.dawuhanapp.com/p/32592291.html");
+        crawler.addPipeline(new ExcelPipeline("C:\\Users\\17664\\Desktop\\hospitaldata","test",columns));
+        crawler.thread(5);
+        crawler.run();
+    }
+}

+ 56 - 0
src/main/java/com/qizhen/spider/hyw/whdxznhospital/DepartmentPageProcessor.java

@@ -0,0 +1,56 @@
+package com.qizhen.spider.hyw.whdxznhospital;
+
+import com.qizhen.spider.pipeline.ExcelPipeline;
+import org.jsoup.internal.StringUtil;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Department data crawler for Zhongnan Hospital of Wuhan University.
+ */
+public class DepartmentPageProcessor implements PageProcessor {
+
+    /** Prefix prepended to the image {@code src} extracted from a department page. */
+    private static final String IMAGE_BASE_URL = "https://www.znhospital.cn";
+
+    /** Page names the original author excluded as invalid (non-department) pages. */
+    private static final List<String> EXCLUDED_PAGES = Arrays.asList(
+            "ningyy.html", "dwbgs.html", "jjjcbgs.html", "dwxcb.html", "kyc.html",
+            "yyzlyaqglbgs.html", "xxzx.html", "sbc.html", "lcsyzx.html", "ztbglbgs.html");
+
+    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);
+
+    /**
+     * Queues every top-level {@code <letters_or_underscores>.html} link of the site and,
+     * unless the current page is excluded or has no department name, stores the fields
+     * {@code id}, {@code dept_name}, {@code des} and {@code dept_img}.
+     *
+     * @param page the downloaded page to extract from
+     */
+    @Override
+    public void process(Page page) {
+        List<String> all = page.getHtml().links().regex("https://www\\.znhospital\\.cn/[a-zA-Z_]+\\.html").all();
+        page.addTargetRequests(all);
+
+        String url = page.getUrl().toString();
+        // Drop the pages listed as invalid departments.
+        if (isExcluded(url)) {
+            page.setSkip(true);
+            return;
+        }
+
+        String deptName = page.getHtml().xpath("/html/body/div[2]/div[3]/div[1]/div[2]/div/div[2]/div[1]/div[1]/div/text()").toString();
+        if (StringUtil.isBlank(deptName)) {
+            page.setSkip(true);
+            return;
+        }
+
+        page.putField("id", url);
+        page.putField("dept_name", deptName);
+        page.putField("des", page.getHtml().xpath("//div[@class='scroll-mod white']/div[@class='desc']/tidyText()").toString());
+
+        // Only store an image URL when an <img> was actually found; concatenating a
+        // missing src would otherwise produce the bogus value "https://www.znhospital.cnnull".
+        String imgSrc = page.getHtml().xpath("/html/body/div[2]/div[3]/div[1]/div[2]/div/div[1]/img/@src").toString();
+        if (!StringUtil.isBlank(imgSrc)) {
+            page.putField("dept_img", IMAGE_BASE_URL + imgSrc);
+        }
+    }
+
+    /** Returns whether {@code url} contains one of the excluded page names. */
+    private static boolean isExcluded(String url) {
+        for (String excluded : EXCLUDED_PAGES) {
+            if (url.contains(excluded)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * @return the crawl configuration (3 retries, 1000 ms sleep, 10000 ms timeout)
+     */
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    /**
+     * Command-line entry point: crawls from the department navigation page with five
+     * threads and sends the results to an {@link ExcelPipeline}.
+     *
+     * @param args command-line arguments (unused)
+     * @throws Exception declared for compatibility with the original signature
+     */
+    public static void main(String[] args) throws Exception {
+        List<String> titles = Arrays.asList("id","dept_name","des","dept_img");
+        Spider.create(new DepartmentPageProcessor()).addUrl("https://www.znhospital.cn/department_navigation.html")
+                .addPipeline(new ExcelPipeline("C:\\Users\\17664\\Desktop\\hospitaldata\\武汉大学中南医院","科室",titles)).thread(5).run();
+    }
+}

+ 129 - 0
src/main/java/com/qizhen/spider/hyw/whdxznhospital/DoctorPageProcessor.java

@@ -0,0 +1,129 @@
+package com.qizhen.spider.hyw.whdxznhospital;
+
+import com.qizhen.spider.pipeline.ExcelPipeline;
+import org.jsoup.internal.StringUtil;
+import org.springframework.util.CollectionUtils;
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.Spider;
+import us.codecraft.webmagic.processor.PageProcessor;
+
+import java.util.*;
+
+/**
+ * Doctor data crawler for Zhongnan Hospital of Wuhan University.
+ */
+public class DoctorPageProcessor implements PageProcessor {
+
+    /** Prefix prepended to the portrait {@code src} extracted from a doctor page. */
+    private static final String IMAGE_BASE_URL = "https://www.znhospital.cn";
+
+    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);
+
+    /**
+     * Queues the expert-team pagination links and doctor detail links found on the page,
+     * then stores the doctor fields ({@code doctor_id}, {@code doctor_name},
+     * {@code header_pic}, {@code title}, {@code des}, {@code good_at}, {@code dept_id}).
+     * Pages with neither kind of link, or without a doctor name, are marked as skipped so
+     * that no empty row reaches the pipeline.
+     *
+     * @param page the downloaded page to extract from
+     */
+    @Override
+    public void process(Page page) {
+        List<String> pages = page.getHtml().links().regex("https://www\\.znhospital\\.cn/[a-zA-Z_]+/expert_team/p/\\d+\\.html").all();
+        if (!CollectionUtils.isEmpty(pages)) {
+            System.out.println(pages.size());
+        }
+        List<String> details = page.getHtml().links().regex("https://www\\.znhospital\\.cn/[a-zA-Z_]+/expert_team/detail/\\d+\\.html").all();
+        if (CollectionUtils.isEmpty(details) && CollectionUtils.isEmpty(pages)) {
+            // Returning without setSkip would hand an empty result to the pipeline,
+            // which would then write an empty spreadsheet row.
+            page.setSkip(true);
+            return;
+        }
+        page.addTargetRequests(pages);
+        page.addTargetRequests(details);
+
+        System.out.println(page.getHtml().xpath("/html/body/div[2]/div[3]/div[2]/div/div").toString());
+        String doctorName = page.getHtml().xpath("/html/body/div[2]/div[3]/div[2]/div/div/div[3]/div[1]/div[1]/div[1]/text()").toString();
+        if (StringUtil.isBlank(doctorName)) {
+            page.setSkip(true);
+            return;
+        }
+
+        page.putField("doctor_id", page.getUrl().toString());
+        page.putField("doctor_name", doctorName);
+
+        // Only store a portrait URL when an <img> was actually found; concatenating a
+        // missing src would otherwise produce the bogus value "https://www.znhospital.cnnull".
+        String imgSrc = page.getHtml().xpath("/html/body/div[2]/div[3]/div[2]/div/div/div[2]/img/@src").toString();
+        if (!StringUtil.isBlank(imgSrc)) {
+            page.putField("header_pic", IMAGE_BASE_URL + imgSrc);
+        }
+        page.putField("title",page.getHtml().xpath("/html/body/div[2]/div[3]/div[2]/div/div/div[3]/div[1]/div[1]/div[2]/text()").toString());
+        page.putField("des",page.getHtml().xpath("//div[@class='scroll-mod white']//div[@class='txt']/tidyText()").toString());
+        page.putField("good_at",page.getHtml().xpath("/html/body/div[2]/div[3]/div[2]/div/div/div[3]/div[2]/div/div[1]/div[2]//div[@class='txt']/tidyText()").toString());
+        page.putField("dept_id", page.getHtml().xpath("/html/body/div[2]/div[3]/div[2]/div/div/div[3]/div[1]/div[2]/div[1]/text()").toString());
+    }
+
+    /**
+     * @return the crawl configuration (3 retries, 1000 ms sleep, 10000 ms timeout)
+     */
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    /**
+     * Command-line entry point: seeds the spider with every URL in {@link #deptUrls},
+     * crawls with five threads and sends the results to an {@link ExcelPipeline}.
+     *
+     * @param args command-line arguments (unused)
+     * @throws Exception declared for compatibility with the original signature
+     */
+    public static void main(String[] args) throws Exception {
+        List<String> titles = Arrays.asList("doctor_id","doctor_name","header_pic","title","des","good_at","dept_id");
+        // The entries in deptUrls are padded with spaces for alignment; strip that padding
+        // so the spider is not seeded with URLs that end in whitespace.
+        String[] deptUrlArray = deptUrls.trim().split("\\s*,\\s*");
+        Spider.create(new DoctorPageProcessor()).addUrl(deptUrlArray)
+                .addPipeline(new ExcelPipeline("C:\\Users\\17664\\Desktop\\hospitaldata\\武汉大学中南医院","医生",titles)).thread(5).run();
+    }
+
+    /**
+     * Comma-separated list of expert-team seed URLs, one per department. Entries are
+     * space-padded for alignment and must be trimmed before use.
+     * NOTE(review): the {@code gdyygrzzzlk.html} entry has no {@code /expert_team}
+     * segment, unlike every other entry — confirm whether that is intentional.
+     */
+    public static String deptUrls = "https://www.znhospital.cn/zhylk/expert_team.html         ,"
+            +"https://www.znhospital.cn/JGXXZBZX/expert_team.html      ,"
+            +"https://www.znhospital.cn/gdywk/expert_team.html         ,"
+            +"https://www.znhospital.cn/xxgwk_/expert_team.html        ,"
+            +"https://www.znhospital.cn/wcwk/expert_team.html          ,"
+            +"https://www.znhospital.cn/gdjbyjy/expert_team.html       ,"
+            +"https://www.znhospital.cn/JZCGMWK/expert_team.html       ,"
+            +"https://www.znhospital.cn/xwk/expert_team.html           ,"
+            +"https://www.znhospital.cn/mnwk/expert_team.html          ,"
+            +"https://www.znhospital.cn/jrzlzxzljrzlzx/expert_team.html,"
+            +"https://www.znhospital.cn/SJWK/expert_team.html          ,"
+            +"https://www.znhospital.cn/JZXRXWK/expert_team.html       ,"
+            +"https://www.znhospital.cn/yk/expert_team.html            ,"
+            +"https://www.znhospital.cn/kqk/expert_team.html           ,"
+            +"https://www.znhospital.cn/ebyhtjwk/expert_team.html      ,"
+            +"https://www.znhospital.cn/opok/expert_team.html          ,"
+            +"https://www.znhospital.cn/tengtk/expert_team.html        ,"
+            +"https://www.znhospital.cn/fcekflk/expert_team.html       ,"
+            +"https://www.znhospital.cn/ZXMRK/expert_team.html         ,"
+            +"https://www.znhospital.cn/fcekszyxzx/expert_team.html    ,"
+            +"https://www.znhospital.cn/FK/expert_team.html            ,"
+            +"https://www.znhospital.cn/fck/expert_team.html           ,"
+            +"https://www.znhospital.cn/csyxwgk/expert_team.html       ,"
+            +"https://www.znhospital.cn/fcekxewk/expert_team.html      ,"
+            +"https://www.znhospital.cn/gjyydyxk/expert_team.html      ,"
+            +"https://www.znhospital.cn/xxgnk/expert_team.html         ,"
+            +"https://www.znhospital.cn/jzygzlk/expert_team.html       ,"
+            +"https://www.znhospital.cn/sjnk/expert_team.html          ,"
+            +"https://www.znhospital.cn/hxywzzyxk/expert_team.html     ,"
+            +"https://www.znhospital.cn/sbnk/expert_team.html          ,"
+            +"https://www.znhospital.cn/zxyjhk/expert_team.html        ,"
+            +"https://www.znhospital.cn/nfmnk/expert_team.html         ,"
+            +"https://www.znhospital.cn/pfk/expert_team.html           ,"
+            +"https://www.znhospital.cn/xhnk/expert_team.html          ,"
+            +"https://www.znhospital.cn/grk/expert_team.html           ,"
+            +"https://www.znhospital.cn/sjkfk/expert_team.html         ,"
+            +"https://www.znhospital.cn/gkkfk/expert_team.html         ,"
+            +"https://www.znhospital.cn/xynk/expert_team.html          ,"
+            +"https://www.znhospital.cn/fsmyk/expert_team.html         ,"
+            +"https://www.znhospital.cn/sjxlk/expert_team.html         ,"
+            +"https://www.znhospital.cn/lnyxk/expert_team.html         ,"
+            +"https://www.znhospital.cn/qkyxk/expert_team.html         ,"
+            +"https://www.znhospital.cn/zlfszlzx/expert_team.html      ,"
+            +"https://www.znhospital.cn/tjyetzlk/expert_team.html      ,"
+            +"https://www.znhospital.cn/rxymnzlk/expert_team.html      ,"
+            +"https://www.znhospital.cn/gmfyk/expert_team.html         ,"
+            +"https://www.znhospital.cn/sgzgylbzlk/expert_team.html    ,"
+            +"https://www.znhospital.cn/fbzlk/expert_team.html         ,"
+            +"https://www.znhospital.cn/cwzlk/expert_team.html         ,"
+            +"https://www.znhospital.cn/fcekek/expert_team.html        ,"
+            +"https://www.znhospital.cn/fkzlk/expert_team.html         ,"
+            +"https://www.znhospital.cn/yxyxk/expert_team.html         ,"
+            +"https://www.znhospital.cn/fccsyzk/expert_team.html       ,"
+            +"https://www.znhospital.cn/xzcsyzk/expert_team.html       ,"
+            +"https://www.znhospital.cn/blk/expert_team.html           ,"
+            +"https://www.znhospital.cn/yxjyk/expert_team.html         ,"
+            +"https://www.znhospital.cn/hyxk/expert_team.html          ,"
+            +"https://www.znhospital.cn/shuxk/expert_team.html         ,"
+            +"https://www.znhospital.cn/yaoxueb/expert_team.html       ,"
+            +"https://www.znhospital.cn/jkglk/expert_team.html         ,"
+            +"https://www.znhospital.cn/zhcsyzk/expert_team.html       ,"
+            +"https://www.znhospital.cn/jjzx/expert_team.html          ,"
+            +"https://www.znhospital.cn/zzyxk/expert_team.html         ,"
+            +"https://www.znhospital.cn/mzsss/expert_team.html         ," +
+            "https://www.znhospital.cn/gdyygrzzzlk.html,"
+            +"https://www.znhospital.cn/lcyjs/expert_team.html         ";
+
+}

+ 81 - 0
src/main/java/com/qizhen/spider/pipeline/ExcelPipeline.java

@@ -0,0 +1,81 @@
+package com.qizhen.spider.pipeline;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.poi.hssf.usermodel.HSSFRow;
+import org.apache.poi.hssf.usermodel.HSSFSheet;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import us.codecraft.webmagic.ResultItems;
+import us.codecraft.webmagic.Task;
+import us.codecraft.webmagic.pipeline.Pipeline;
+import us.codecraft.webmagic.utils.FilePersistentBase;
+
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.List;
+
+/**
+ * WebMagic pipeline that appends every processed result as one row of an {@code .xls}
+ * workbook and rewrites the workbook file after each row.
+ *
+ * <p>{@link #process} and {@link #save} are synchronized on this instance, so a single
+ * pipeline can be shared by a multi-threaded spider.
+ */
+public class ExcelPipeline extends FilePersistentBase implements Pipeline {
+
+    /** Name of the output file: the given name plus a timestamp and {@code .xls}. */
+    private final String filename;
+    /** Index of the next row to write; row 0 holds the titles. */
+    private int rows = 0;
+    /** Workbook holding the single output sheet. */
+    private final HSSFWorkbook workbook;
+    /** Sheet that receives the title row and all result rows. */
+    private final HSSFSheet sheet;
+    /** Result-field keys, in column order; also used as the header row. */
+    private final List<String> titles;
+
+    /**
+     * Creates the workbook in memory and writes the title row.
+     *
+     * @param path   directory the workbook file is saved to
+     * @param name   sheet name and prefix of the output file name
+     * @param titles result-field keys, one per column, in column order
+     */
+    public ExcelPipeline(String path,String name,List<String> titles) {
+        this.titles = titles;
+        // Set the output directory.
+        setPath(path);
+
+        // The file name carries a creation timestamp.
+        filename = name+new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()) + ".xls";
+        workbook = new HSSFWorkbook();
+        sheet = workbook.createSheet(name);
+
+        // Header row.
+        HSSFRow row = sheet.createRow(rows);
+        for(int i=0;i<titles.size();i++) {
+            row.createCell(i).setCellValue(titles.get(i));
+        }
+        // Result rows start on the line after the header.
+        rows++;
+    }
+
+    /**
+     * Appends one row with the value of each title key (empty string when the key is
+     * absent) and then saves the workbook.
+     *
+     * <p>Synchronized because the row counter and the sheet are shared mutable state
+     * and the spider may call this method from several threads.
+     *
+     * @param resultItems extracted fields of one page
+     * @param task        the running task (unused)
+     */
+    @Override
+    public synchronized void process(ResultItems resultItems, Task task) {
+        HSSFRow row = sheet.createRow(rows);
+        for (int i = 0; i < titles.size(); i++) {
+            Object value = resultItems.get(titles.get(i));
+            row.createCell(i).setCellValue(value == null ? "" : value.toString());
+        }
+        rows++;
+        // Persist after every row.
+        save();
+    }
+
+    /**
+     * Writes the whole workbook to the output file. I/O failures are printed and
+     * otherwise ignored, so a failed save does not stop the crawl.
+     */
+    private synchronized void save() {
+        // try-with-resources closes the stream even when workbook.write fails.
+        try (FileOutputStream out = new FileOutputStream(getFile(this.path + filename))) {
+            workbook.write(out);
+            System.out.println(this.path + filename + "存储完毕");
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+
+}

+ 3 - 0
src/main/resources/application.properties

@@ -0,0 +1,3 @@
+# 应用服务 WEB 访问端口
+server.port=8080
+

Різницю між файлами не показано, бо вона завелика
+ 395 - 0
src/main/resources/static/index.html


+ 13 - 0
src/test/java/com/qizhen/spider/QizhenApplicationTests.java

@@ -0,0 +1,13 @@
+package com.qizhen.spider;
+
+import org.junit.jupiter.api.Test;
+import org.springframework.boot.test.context.SpringBootTest;
+
+/**
+ * Spring Boot test class for the application; contains a single empty test method.
+ */
+@SpringBootTest
+class QizhenApplicationTests {
+
+    /** Intentionally empty test body; the class-level annotation does the work. */
+    @Test
+    void contextLoads() {
+    }
+
+}