|
@@ -0,0 +1,129 @@
|
|
|
+package com.qizhen.spider.hyw.whdxznhospital;
|
|
|
+
|
|
|
+import com.qizhen.spider.pipeline.ExcelPipeline;
|
|
|
+import org.jsoup.internal.StringUtil;
|
|
|
+import org.springframework.util.CollectionUtils;
|
|
|
+import us.codecraft.webmagic.Page;
|
|
|
+import us.codecraft.webmagic.Site;
|
|
|
+import us.codecraft.webmagic.Spider;
|
|
|
+import us.codecraft.webmagic.processor.PageProcessor;
|
|
|
+
|
|
|
+import java.util.*;
|
|
|
+
|
|
|
+/**
|
|
|
+ * 医生数据抓取
|
|
|
+ * 武汉大学中南医院
|
|
|
+ */
|
|
|
+public class DoctorPageProcessor implements PageProcessor {
|
|
|
+
|
|
|
+ private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);
|
|
|
+ @Override
|
|
|
+ public void process(Page page) {
|
|
|
+ List<String> pages = page.getHtml().links().regex("https://www\\.znhospital\\.cn/[a-zA-Z_]+/expert_team/p/\\d+\\.html").all();
|
|
|
+ if(!CollectionUtils.isEmpty(pages) && pages.size()>0){
|
|
|
+ System.out.println(pages.size());
|
|
|
+ }
|
|
|
+ List<String> details = page.getHtml().links().regex("https://www\\.znhospital\\.cn/[a-zA-Z_]+/expert_team/detail/\\d+\\.html").all();
|
|
|
+ if(CollectionUtils.isEmpty(details) && CollectionUtils.isEmpty(pages)) {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ page.addTargetRequests(pages);
|
|
|
+ page.addTargetRequests(details);
|
|
|
+ String url = page.getUrl().toString();
|
|
|
+ page.putField("doctor_id",url );
|
|
|
+ System.out.println(page.getHtml().xpath("/html/body/div[2]/div[3]/div[2]/div/div").toString());
|
|
|
+ String doctor_name = page.getHtml().xpath("/html/body/div[2]/div[3]/div[2]/div/div/div[3]/div[1]/div[1]/div[1]/text()").toString();
|
|
|
+ if (StringUtil.isBlank(doctor_name)){
|
|
|
+ page.setSkip(true);
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ page.putField("doctor_name", doctor_name);
|
|
|
+ page.putField("header_pic","https://www.znhospital.cn"+page.getHtml().xpath("/html/body/div[2]/div[3]/div[2]/div/div/div[2]/img/@src"));
|
|
|
+ page.putField("title",page.getHtml().xpath("/html/body/div[2]/div[3]/div[2]/div/div/div[3]/div[1]/div[1]/div[2]/text()").toString());
|
|
|
+ page.putField("des",page.getHtml().xpath("//div[@class='scroll-mod white']//div[@class='txt']/tidyText()").toString());
|
|
|
+ page.putField("good_at",page.getHtml().xpath("/html/body/div[2]/div[3]/div[2]/div/div/div[3]/div[2]/div/div[1]/div[2]//div[@class='txt']/tidyText()").toString());
|
|
|
+ page.putField("dept_id", page.getHtml().xpath("/html/body/div[2]/div[3]/div[2]/div/div/div[3]/div[1]/div[2]/div[1]/text()").toString());
|
|
|
+ }
|
|
|
+
|
|
|
+ @Override
|
|
|
+ public Site getSite() {
|
|
|
+ return site;
|
|
|
+ }
|
|
|
+
|
|
|
+ public static void main(String[] args) throws Exception {
|
|
|
+ List<String> titles = Arrays.asList("doctor_id","doctor_name","header_pic","title","des","good_at","dept_id");
|
|
|
+ String[] deptUrlArray = deptUrls.split(",");
|
|
|
+ Spider.create(new DoctorPageProcessor()).addUrl(deptUrlArray)
|
|
|
+ .addPipeline(new ExcelPipeline("C:\\Users\\17664\\Desktop\\hospitaldata\\武汉大学中南医院","医生",titles)).thread(5).run();
|
|
|
+ }
|
|
|
+
|
|
|
+ public static String deptUrls = "https://www.znhospital.cn/zhylk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/JGXXZBZX/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/gdywk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/xxgwk_/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/wcwk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/gdjbyjy/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/JZCGMWK/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/xwk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/mnwk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/jrzlzxzljrzlzx/expert_team.html,"
|
|
|
+ +"https://www.znhospital.cn/SJWK/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/JZXRXWK/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/yk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/kqk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/ebyhtjwk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/opok/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/tengtk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/fcekflk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/ZXMRK/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/fcekszyxzx/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/FK/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/fck/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/csyxwgk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/fcekxewk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/gjyydyxk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/xxgnk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/jzygzlk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/sjnk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/hxywzzyxk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/sbnk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/zxyjhk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/nfmnk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/pfk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/xhnk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/grk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/sjkfk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/gkkfk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/xynk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/fsmyk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/sjxlk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/lnyxk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/qkyxk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/zlfszlzx/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/tjyetzlk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/rxymnzlk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/gmfyk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/sgzgylbzlk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/fbzlk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/cwzlk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/fcekek/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/fkzlk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/yxyxk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/fccsyzk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/xzcsyzk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/blk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/yxjyk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/hyxk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/shuxk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/yaoxueb/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/jkglk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/zhcsyzk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/jjzx/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/zzyxk/expert_team.html ,"
|
|
|
+ +"https://www.znhospital.cn/mzsss/expert_team.html ," +
|
|
|
+ "https://www.znhospital.cn/gdyygrzzzlk.html,"
|
|
|
+ +"https://www.znhospital.cn/lcyjs/expert_team.html ";
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+}
|