Python 官方文档:入门教程 => 点击学习
目录创建数据库: 新建SpringBoot项目: 1、配置依赖pom.xml 2、创建CmsContentPO.java 3、创建CrawlerMapper.java 4、配置映射文
WebMagic是一个开源爬虫框架,本项目通过在Spring Boot项目中使用WebMagic去抓取数据,最后使用MyBatis将数据入库。
本项目代码地址:ArticleCrawler: SpringBoot+WebMagic+MyBatis实现爬虫和数据入库 (gitee.com)
本示例中库名为article,表名为cms_content,表中包含contentId、title、date三个字段。
-- Content table with three columns: contentId (primary key), title and date.
-- All three columns are stored as NOT NULL strings.
CREATE TABLE `cms_content` (
    `contentId` varchar(40)  NOT NULL COMMENT '内容ID',
    `title`     varchar(150) NOT NULL COMMENT '标题',
    `date`      varchar(150) NOT NULL COMMENT '发布日期',
    PRIMARY KEY (`contentId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='CMS内容表';
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build for the Article crawler: Spring Boot web application that uses
  WebMagic for crawling and MyBatis (with a Druid pool and the MySQL driver)
  for persistence.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.5.5</version>
        <relativePath/>
    </parent>
    <groupId>com.example</groupId>
    <artifactId>Article</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>Article</name>
    <description>Article</description>
    <!-- Property names are case-sensitive and must match the ${...} references below. -->
    <properties>
        <java.version>1.8</java.version>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.test.skip>true</maven.test.skip>
        <maven.compiler.plugin.version>3.8.1</maven.compiler.plugin.version>
        <maven.resources.plugin.version>3.1.0</maven.resources.plugin.version>
        <mysql.connector.version>5.1.47</mysql.connector.version>
        <druid.spring.boot.starter.version>1.1.17</druid.spring.boot.starter.version>
        <mybatis.spring.boot.starter.version>1.3.4</mybatis.spring.boot.starter.version>
        <fastjson.version>1.2.58</fastjson.version>
        <commons.lang3.version>3.9</commons.lang3.version>
        <joda.time.version>2.10.2</joda.time.version>
        <webmagic.core.version>0.7.5</webmagic.core.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-configuration-processor</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>${mysql.connector.version}</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid-spring-boot-starter</artifactId>
            <version>${druid.spring.boot.starter.version}</version>
        </dependency>
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>${mybatis.spring.boot.starter.version}</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>${fastjson.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>${commons.lang3.version}</version>
        </dependency>
        <dependency>
            <groupId>joda-time</groupId>
            <artifactId>joda-time</artifactId>
            <version>${joda.time.version}</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>${webmagic.core.version}</version>
            <!-- Exclude the log4j binding that webmagic-core pulls in transitively. -->
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>${maven.compiler.plugin.version}</version>
                <configuration>
                    <source>${java.version}</source>
                    <target>${java.version}</target>
                    <encoding>${project.build.sourceEncoding}</encoding>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-resources-plugin</artifactId>
                <version>${maven.resources.plugin.version}</version>
                <configuration>
                    <encoding>${project.build.sourceEncoding}</encoding>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
                <configuration>
                    <fork>true</fork>
                    <addResources>true</addResources>
                </configuration>
                <executions>
                    <execution>
                        <goals>
                            <goal>repackage</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
    <!-- HTTPS is used because Maven 3.8.1+ blocks plain-HTTP repositories by default. -->
    <repositories>
        <repository>
            <id>public</id>
            <name>aliyun nexus</name>
            <url>https://maven.aliyun.com/nexus/content/groups/public/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
        </repository>
    </repositories>
    <pluginRepositories>
        <pluginRepository>
            <id>public</id>
            <name>aliyun nexus</name>
            <url>https://maven.aliyun.com/nexus/content/groups/public/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
        </pluginRepository>
    </pluginRepositories>
</project>
数据实体,和表中3个字段对应。
package site.exciter.article.model;
/**
 * Persistence object with one String property per column of the
 * {@code cms_content} table: content id, title and date.
 */
public class CmsContentPO {

    /** Value of the {@code contentId} column. */
    private String contentId;

    /** Value of the {@code title} column. */
    private String title;

    /** Value of the {@code date} column, kept as a plain string. */
    private String date;

    /** @return the content id, or {@code null} if it has not been set */
    public String getContentId() {
        return this.contentId;
    }

    /** @param contentId the content id to store */
    public void setContentId(final String contentId) {
        this.contentId = contentId;
    }

    /** @return the title, or {@code null} if it has not been set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title to store */
    public void setTitle(final String title) {
        this.title = title;
    }

    /** @return the date string, or {@code null} if it has not been set */
    public String getDate() {
        return this.date;
    }

    /** @param date the date string to store */
    public void setDate(final String date) {
        this.date = date;
    }
}
package site.exciter.article.dao;
import org.apache.ibatis.annotations.Mapper;
import site.exciter.article.model.CmsContentPO;
/**
 * MyBatis mapper interface for the {@code cms_content} table.
 */
@Mapper
public interface CrawlerMapper {
/**
 * Inserts one content record.
 *
 * @param record the record to insert
 * @return the int result of the insert statement; the pipeline caller treats
 *         a value greater than zero as success
 */
int addCmsContent(CmsContentPO record);
}
在resources下新建mapper文件夹,在mapper下创建CrawlerMapper.xml
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<!-- SQL mapping for site.exciter.article.dao.CrawlerMapper; the namespace must equal that interface's fully qualified name. -->
<mapper namespace="site.exciter.article.dao.CrawlerMapper">
<!-- Inserts one CmsContentPO into cms_content; every property is bound as a VARCHAR parameter. -->
<insert id="addCmsContent" parameterType="site.exciter.article.model.CmsContentPO">
insert into cms_content (contentId,
title,
date)
values (#{contentId,jdbcType=VARCHAR},
#{title,jdbcType=VARCHAR},
#{date,jdbcType=VARCHAR})
</insert>
</mapper>
配置数据库和mybatis映射关系。
# mysql: connection settings for the "article" database, pooled by Druid
spring.datasource.name=mysql
spring.datasource.type=com.alibaba.druid.pool.DruidDataSource
spring.datasource.driver-class-name=com.mysql.jdbc.Driver
spring.datasource.url=jdbc:mysql://10.201.61.184:3306/article?useUnicode=true&characterEncoding=utf8&useSSL=false&allowMultiQueries=true
spring.datasource.username=root
# Key normalised to the canonical lowercase "password" (was "passWord").
spring.datasource.password=root
# druid: connection pool sizing and idle-connection validation
spring.datasource.druid.initial-size=5
spring.datasource.druid.min-idle=5
spring.datasource.druid.max-active=10
spring.datasource.druid.max-wait=60000
spring.datasource.druid.validation-query=SELECT 1 FROM DUAL
spring.datasource.druid.test-on-borrow=false
spring.datasource.druid.test-on-return=false
spring.datasource.druid.test-while-idle=true
spring.datasource.druid.time-between-eviction-runs-millis=60000
spring.datasource.druid.min-evictable-idle-time-millis=300000
spring.datasource.druid.max-evictable-idle-time-millis=600000
# mybatis: location of the mapper XML on the classpath
mybatis.mapperLocations=classpath:mapper/CrawlerMapper.xml
解析html的逻辑。
package site.exciter.article;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
/**
 * WebMagic page processor that extracts an article's title and date and
 * queues further pages to crawl.
 *
 * <p>Each page gets "title" and "date" result fields. A page with no title is
 * marked as skipped. Detail links and the next-page link found on the page are
 * added as target requests.
 */
@Component
public class ArticlePageProcessor implements PageProcessor {

    /** Links to article detail pages. */
    private static final String DETAIL_URLS_XPATH = "//*[@class='postTitle']/a[@class='postTitle2']/@href";

    /** Primary locator for the next-page link. */
    private static final String NEXT_PAGE_XPATH = "//*[@id='nav_next_page']/a/@href";

    /** Fallback locator for the next-page link, tried only when the XPath finds nothing. */
    private static final String NEXT_PAGE_CSS = "#homepage_top_pager > div:nth-child(1) > a:nth-child(7)";

    /** Article title. */
    private static final String TITLE_XPATH = "//h1[@class='postTitle']/a/span/text()";

    /** Article publish date text. */
    private static final String DATE_XPATH = "//span[@id='post-date']/text()";

    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    /**
     * Extracts the result fields from {@code page} and enqueues follow-up URLs.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        String title = page.getHtml().xpath(TITLE_XPATH).toString();
        page.putField("title", title);
        if (title == null) {
            // No title was extracted from this page, so mark its result as skipped.
            page.setSkip(true);
        }
        page.putField("date", page.getHtml().xpath(DATE_XPATH).toString());

        Selectable detailUrls = page.getHtml().xpath(DETAIL_URLS_XPATH);
        if (detailUrls.match()) {
            page.addTargetRequests(detailUrls.all());
        }

        Selectable nextPageByXpath = page.getHtml().xpath(NEXT_PAGE_XPATH);
        if (nextPageByXpath.match()) {
            page.addTargetRequests(nextPageByXpath.all());
        } else {
            Selectable nextPageByCss = page.getHtml().css(NEXT_PAGE_CSS);
            if (nextPageByCss.match()) {
                page.addTargetRequests(nextPageByCss.links().all());
            }
        }
    }

    /**
     * @return the crawl settings built for this processor (3 retries, sleep time 1000)
     */
    @Override
    public Site getSite() {
        return site;
    }
}
处理数据的持久化。
package site.exciter.article;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import site.exciter.article.model.CmsContentPO;
import site.exciter.article.dao.CrawlerMapper;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.util.UUID;
@Component
public class ArticlePipeline implements Pipeline {
private static final Logger LOGGER = LoggerFactory.getLogger(ArticlePipeline.class);
@Autowired
private CrawlerMapper crawlerMapper;
public void process(ResultItems resultItems, Task task) {
String title = resultItems.get("title");
String date = resultItems.get("date");
CmsContentPO contentPO = new CmsContentPO();
contentPO.setContentId(UUID.randomUUID().toString());
contentPO.setTitle(title);
contentPO.setDate(date);
try {
boolean success = crawlerMapper.addCmsContent(contentPO) > 0;
LOGGER.info("保存成功:{}", title);
} catch (Exception ex) {
LOGGER.error("保存失败", ex);
}
}
}
执行抓取任务。
package site.exciter.article;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Spider;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
/**
 * Schedules the article crawl on a single background thread.
 */
@Component
public class ArticleTask {
    // The logger must be bound to this class so its messages are not attributed to ArticlePipeline.
    private static final Logger LOGGER = LoggerFactory.getLogger(ArticleTask.class);

    @Autowired
    private ArticlePipeline articlePipeline;

    @Autowired
    private ArticlePageProcessor articlePageProcessor;

    private final ScheduledExecutorService timer = Executors.newSingleThreadScheduledExecutor();

    /**
     * Starts the recurring crawl: the first run is immediate, and each later
     * run begins 10 minutes after the previous scheduled task returned.
     *
     * <p>Each run creates a new spider and starts it asynchronously, so the
     * scheduled task returns without waiting for the crawl to finish.
     * Exceptions raised while starting a spider are logged so that later runs
     * are not cancelled.
     */
    public void crawl() {
        // Scheduled job: crawl every 10 minutes.
        timer.scheduleWithFixedDelay(() -> {
            Thread.currentThread().setName("ArticleCrawlerThread");
            try {
                Spider.create(articlePageProcessor)
                        .addUrl("http://www.cnblogs.com/dick159/default.html?page=2")
                        // Store the crawled data in the database.
                        .addPipeline(articlePipeline)
                        // Crawl with 5 threads.
                        .thread(5)
                        // Start the spider asynchronously.
                        .start();
            } catch (Exception ex) {
                LOGGER.error("定时抓取数据线程执行异常", ex);
            }
        }, 0, 10, TimeUnit.MINUTES);
    }
}
package site.exciter.article;
import org.mybatis.spring.annotation.MapperScan;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
/**
 * Spring Boot entry point. Once the application context is up, it starts the
 * scheduled crawl through {@link ArticleTask}.
 */
@SpringBootApplication
// Must name the package that actually contains the MyBatis mapper interfaces.
@MapperScan(basePackages = "site.exciter.article.dao")
public class ArticleApplication implements CommandLineRunner {

    @Autowired
    private ArticleTask articleTask;

    /**
     * Boots the Spring application.
     *
     * @param args command-line arguments passed through to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication.run(ArticleApplication.class, args);
    }

    /**
     * Starts the crawl schedule.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if starting the crawl fails
     */
    @Override
    public void run(String... args) throws Exception {
        articleTask.crawl();
    }
}
到此这篇关于SpringBoot+WebMagic+MyBatis实现爬虫和数据入库的示例的文章就介绍到这了,更多相关SpringBoot+WebMagic+MyBatis爬虫和数据入库内容请搜索编程网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持编程网!
--结束END--
本文标题: SpringBoot+WebMagic+MyBatis实现爬虫和数据入库的示例
本文链接: https://lsjlt.com/news/154630.html(转载时请注明来源链接)
有问题或投稿请发送至: 邮箱/279061341@qq.com QQ/279061341
2024-03-01
2024-03-01
2024-03-01
2024-02-29
2024-02-29
2024-02-29
2024-02-29
2024-02-29
2024-02-29
2024-02-29
回答
回答
回答
回答
回答
回答
回答
回答
回答
回答
0