首先添加 flexmark 依赖(Markdown 解析框架,本文用它把抓取到的 HTML 内容转换成 Markdown):
<!-- Markdown-to-HTML conversion; Markdown parsing framework -->
<dependency>
    <groupId>com.vladsch.flexmark</groupId>
    <artifactId>flexmark-all</artifactId>
    <version>0.62.2</version>
</dependency>
在 HTML 页面中添加用于输入 URL 的文本框(我使用的是 layui 2.7.6):
<!--
  Crawler input form (layui form pane).
  The two links call the page-level functions piliangReptile() (batch crawl) and
  reptile() (single article); both read the URL from the #zhuaqu textarea.
-->
<div class="layui-form-pane">
    <div class="layui-form-item layui-form-text">
        <label class="layui-form-label">
            <a href="javascript:void(0)" onclick="piliangReptile()"><i class="fa fa-github" style="color:red;font-size: x-large;"></i>批量抓取文章</a><!--
            (comment bridges the line break so no whitespace is rendered between the two links)
         --><a href="javascript:void(0)" style="margin-left: 80%;" onclick="reptile()"><i class="fa fa-github" style="color:red;font-size: x-large;"></i>抓取单篇文章</a>
        </label>
        <div class="layui-input-block">
            <!-- Attributes are now separated by whitespace (class/placeholder were fused). -->
            <textarea id="zhuaqu" class="layui-textarea" placeholder=" 抓取单篇文章URL示例:(文章ID)。 列表抓取URL示例:(页码非必填)"></textarea>
        </div>
    </div>
</div>
JS:
/**
 * Crawls a single article.
 * Posts the URL typed into the #zhuaqu textarea to /homeReptile and shows the
 * message of the response once the request completes.
 */
function reptile() {
    // Show a loading layer while the request is in flight.
    var loadIndex = layer.load(2, {
        shade: [0.1, '#fff'] // white backdrop at 0.1 opacity
    });
    $.post('/homeReptile', {url: $('#zhuaqu').val()}, function (data) {
        layer.close(loadIndex); // close the loading layer
        // NOTE(review): the published text was truncated to `layer.ssage)`; restored as a
        // layer message showing the response's `message` field — confirm against the original.
        layer.msg(data.message);
    }).fail(function () {
        // Without this the loading layer would stay on screen forever after a failed request.
        layer.close(loadIndex);
    });
}

/**
 * Crawls a list of articles.
 * Sends the URL typed into the #zhuaqu textarea to /piliangReptile (GET) and shows the
 * message of the response once the request completes.
 */
function piliangReptile() {
    // Show a loading layer while the request is in flight.
    var loadIndex = layer.load(2, {
        shade: [0.1, '#fff'] // white backdrop at 0.1 opacity
    });
    $.get('/piliangReptile', {url: $('#zhuaqu').val()}, function (data) {
        layer.close(loadIndex); // close the loading layer
        // NOTE(review): the published text was truncated to `layer.ssage)`; restored as a
        // layer message showing the response's `message` field — confirm against the original.
        layer.msg(data.message);
    }).fail(function () {
        // Without this the loading layer would stay on screen forever after a failed request.
        layer.close(loadIndex);
    });
}
controller:
// NOTE(review): the controller snippet on the line after this comment block is reproduced exactly
// as published. It was flattened onto a single line, so every `//` comment now swallows the code
// that follows it, and a number of tokens were lost in extraction (for example `ptile(url)`,
// `(String())`, `im()`, `list.im())`, `(i).contains`, `ains("/")`, `t();`, `new InputStream()`,
// `String();`; the escapes inside `split("n")`, `split(""")` and `"<div class="article-list">"`
// are gone as well). It does not compile in this form, and the missing pieces cannot be recovered
// from this page alone.
//
// What is still readable:
//  - POST /homeReptile -> reptile(String url): tests StringUtils.isBlank(url); the blank branch
//    carries the message "无效的URL".
//  - GET /piliangReptile -> piliangReptile(String url): downloads the page with
//    getWebpageContent(url), takes the substring between the "article-list" div and "</main>",
//    filters out empty lines, treats a line containing "article/details/" that is followed by a
//    line containing "<span class" as an article link, counts results whose code is 200 as
//    successes and all others as failures, logs the totals and returns them in a
//    ResponseResult with code 200 and the input url as data.
//    The local list `urls` is declared but nothing visible adds to it.
//  - getWebpageContent(String url): opens an HttpURLConnection, reads the response line by line
//    through a BufferedReader using UTF-8 into a StringBuilder, and returns "" after printing the
//    stack trace if anything throws.
//首页文章、ip用户、留言统计 ---首页各种统计信息 服务器监控.redis监控 ---请访问HomeController@PostMapping(value = "/homeReptile")public ResponseResult reptile(String url) {if(StringUtils.isBlank(url)){("无效的URL");}ptile(url);}@GetMapping(value = "/piliangReptile")public ResponseResult piliangReptile(String url) {String content = getWebpageContent(url); // 抓取页面的内容ArrayList<String> urls = new ArrayList<>();int succint = 0 ;int errint = 0 ;// 根据观察,找到需要抓取的文章列表的范围int start = content.indexOf("<div class="article-list">");int end = content.indexOf("</main>");// 如果没找到,返回if (start < 0 || end < 0)(String());// 对数据进行筛选,移除空行。ArrayList<String> list = new ArrayList<>();for (String line : content.substring(start, end).split("n"))if (im().length() > 0)list.im());// 提取内容for (int i = 0; i < list.size(); i++) {if ((i).contains("article/details/") && (i + 1).contains("<span class")) {String aurl = (i).split(""")[1];ains("/")){ResponseResult a = ptile(aurl);if (a.getCode() == 200){succint++;}else{errint++;}}}}log.info("成功抓取"+succint+"篇文章,失败"+errint+"篇!");return new ResponseResult(200,"成功抓取"+succint+"篇文章,失败"+errint+"篇!", url);}// 从指定网址获取网页的内容,返回为网站的HTML字符串。public static String getWebpageContent(String url) {try {URL u = new URL(url);HttpURLConnection conn = (HttpURLConnection) u.openConnection();t();BufferedReader br = new BufferedReader(new InputStream(), StandardCharsets.UTF_8));StringBuilder sb = new StringBuilder();String str;while ((str = br.readLine()) != null) {sb.append(str).append("n");}br.close();conn.disconnect();String();} catch (Exception e) {e.printStackTrace();}return "";}
使用到的统一返回类:
package com.shiyimon;import io.swagger.annotations.ApiModel;
import io.swagger.annotations.ApiModelProperty;
import lombok.Data;import java.util.HashMap;
import java.util.Map;import static com.shiyimon.ResultCode.*;/*** <p> 统一返回结果类 </p>** @description :* @author : blue*/
// NOTE(review): several expressions in this class lost characters when the article was extracted
// (`a.put(...)`, `new Code(), ...`, `Desc()`, `de = ssage = msg`). They are kept verbatim below
// and flagged inline; the class does not compile until they are restored from the original
// project (the missing names presumably come from the statically imported ResultCode — confirm).
@ApiModel(value = "统一返回结果类")
@Data
public class ResponseResult {

    /** Message text. */
    @ApiModelProperty(value = "响应消息", required = false)
    private String message;

    /** Response code; see {@code ResultCode}. */
    @ApiModelProperty(value = "响应码", required = true)
    private Integer code;

    /** Data carried in the response. */
    @ApiModelProperty(value = "响应数据", required = false)
    private Object data;

    // Additional key/value pairs; starts out as an empty HashMap.
    @ApiModelProperty(value = "响应数据", required = false)
    private Map<String,Object> extra = new HashMap<>();

    // Returns this instance so calls can be chained.
    public ResponseResult putExtra(String key, Object value) {
        a.put(key, value); // NOTE(review): receiver truncated; presumably the `extra` map — confirm
        return this;
    }

    // Error result with a caller-supplied message and null data.
    public static ResponseResult error(String message) {
        return new Code(), message, null); // NOTE(review): text between `new` and `Code()` lost
    }

    // Error result without arguments and with null data.
    public static ResponseResult error() {
        return new Code(), Desc(), null); // NOTE(review): damaged in extraction
    }

    // Error result with an explicit code and message; data is null.
    public static ResponseResult error(Integer code, String message) {
        return new ResponseResult(code, message, null);
    }

    // Success result without arguments and with null data.
    public static ResponseResult success() {
        return new Code(), Desc(), null); // NOTE(review): damaged in extraction
    }

    // Success result carrying the given data.
    public static ResponseResult success(Object data) {
        return new Code(),Desc(), data); // NOTE(review): damaged in extraction
    }

    // Success result carrying a caller-supplied message and data.
    public static ResponseResult success(String message, Object data) {
        return new Code(), message, data); // NOTE(review): damaged in extraction
    }

    // Result built from an explicit code, message and data.
    public static ResponseResult success(Integer code, String message, Object data) {
        return new ResponseResult(code, message, data);
    }

    // Result built from an explicit code and message; data is null.
    public static ResponseResult success(Integer code, String message) {
        return new ResponseResult(code, message,null);
    }

    // Three-argument constructor used by the factory methods above.
    public ResponseResult(Integer code, String msg, Object data) {
        // NOTE(review): damaged in extraction; presumably assigned `code` and `msg` to the
        // `code` and `message` fields — confirm.
        de = ssage = msg;
        this.data = data;
    }
}
service:
// NOTE(review): the service method on the line after this comment block is reproduced exactly as
// published. It was flattened onto a single line, so the first `//` comment swallows everything
// after it, and a number of tokens were lost in extraction (for example `t(url).get()`,
// `ElementsByClass(...)`, `StringUtils.String())`, `(0).toString()`, `.(0).text())`,
// `String tag = ();`, `tagsId.Id());`, `Tags.builder().Title())`, `tagsMapper.Id(),tagsId)`,
// `JSONString(entity)`; the inner quotes of `"<code class="lang-java">"` are unescaped).
// It does not compile in this form, and the missing pieces cannot be recovered from this page.
//
// What is still readable: reptile(String url) is annotated
// @Transactional(rollbackFor = Exception.class). It reads the elements with classes
// "title-article", "tag-link" and "article_content" from the fetched document, rewrites "<code>"
// tags to carry class "lang-java", converts that HTML to Markdown with FlexmarkHtmlConverter
// (replacing "lang-java" with "java" in the result), builds a BlogArticle (userId 7L,
// OTHER_CATEGORY_ID, IMG_URL_API as avatar, the input url as originalUrl) and inserts it through
// baseMapper. For each scraped tag it looks up a Tags row by name and inserts one when none is
// found. An IOException is rethrown wrapped in BusinessException; otherwise the method returns
// ResponseResult.success().
//
// NOTE(review): in the no-tag fallback branch a Tags object is built and something derived from it
// is added to tagsId, but no tagsMapper.insert(...) for it is visible — confirm that the fallback
// tag is actually persisted before its id is used.
/*** 抓取文章* @return*/@Override@Transactional(rollbackFor = Exception.class)public ResponseResult reptile(String url) {try {Document document = t(url).get();Elements title = ElementsByClass("title-article");Elements tags = ElementsByClass("tag-link");Elements content = ElementsByClass("article_content");if (StringUtils.String())) {("文章抓取失败");}//爬取的是HTML内容,需要转成MD格式的内容String newContent = (0).toString().replaceAll("<code>", "<code class="lang-java">");MutableDataSet options = new MutableDataSet();String markdown = FlexmarkHtmlConverter.builder(options).build().convert(newContent).replace("lang-java","java");//把抓取下来的文章赋值到实体BlogArticle entity = BlogArticle.builder().userId(7L).contentMd(markdown).categoryId(OTHER_CATEGORY_ID).isOriginal(Code()).originalUrl(url).(0).text()).avatar(IMG_URL_API).content(newContent).build();//IMG_URL_API是我文章图片封面的url,需要自己修改baseMapper.insert(entity);//保存文章//为该文章添加标签List<Long> tagsId = new ArrayList<>();tags.forEach(item ->{String tag = ();Tags result = tagsMapper.selectOne(new QueryWrapper<Tags>().eq(SqlConf.NAME,tag ));if (result == null ){result = Tags.builder().name(tag).build();//以页面已有的标签在本地创建一个新的标签tagsMapper.insert(result);}tagsId.Id());});if(tagsId.size()==0 ){//如果csdn写文章时就没有写标签 ---那么就定义一个后期改log.info("文章抓取成功,但是无标签,已使用标题作为标签:{}", JSONString(entity));Tags t = Tags.builder().Title()).build();//以页面已有的标签在本地创建一个新的标签tagsId.Id());}tagsMapper.Id(),tagsId);log.info("文章抓取成功,内容为:{}", JSONString(entity));} catch (IOException e) {throw new BusinessException(e);}return ResponseResult.success();}
文章实体类:
package ity;import batisplus.annotation.*;
import com.fasterxml.jackson.annotation.JsonFormat;
import com.shiyi.util.DateUtils;
import io.swagger.annotations.ApiModel;
import io.swagger.annotations.ApiModelProperty;
import lombok.*;import java.io.Serializable;
import java.util.Date;

/**
 * Blog article table.
 *
 * @author blue
 * @since 2021-08-18
 */
@Data
@EqualsAndHashCode(callSuper = false)
@TableName("b_article")
@ApiModel(value="BlogArticle对象", description="博客文章表")
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class BlogArticle implements Serializable {

    private static final long serialVersionUID=1L;

    /** Primary key id (IdType.AUTO). */
    @ApiModelProperty(value = "主键id")
    @TableId(value = "id", type = IdType.AUTO)
    private Long id;

    /** User id. */
    @ApiModelProperty(value = "用户id")
    private Long userId;

    /** Category id. */
    @ApiModelProperty(value = "分类id")
    private Long categoryId;

    /** Article title. */
    @ApiModelProperty(value = "文章标题")
    private String title;

    /** Address of the article cover image. */
    @ApiModelProperty(value = "文章封面地址")
    private String avatar;

    /** Article summary. */
    @ApiModelProperty(value = "文章简介")
    private String summary;

    /** Article content. */
    @ApiModelProperty(value = "文章内容")
    private String content;

    /** Article content, Markdown version. */
    @ApiModelProperty(value = "文章内容MD版")
    private String contentMd;

    /** Publish status: 0 = taken down, 1 = published. */
    @ApiModelProperty(value = "发布状态 0:下架;1:上架")
    private Integer isPublish;

    /** Whether the article is private: 0 = no, 1 = yes. */
    @ApiModelProperty(value = "是否是私密文章 0 否 1是")
    private Integer isSecret;

    /** Whether the article is pinned: 0 = no, 1 = yes. */
    @ApiModelProperty(value = "是否置顶 0否 1是")
    private Integer isStick;

    /** Whether the article is original: 0 = repost, 1 = original. */
    @ApiModelProperty(value = "是否原创 0:转载 1:原创")
    private Integer isOriginal;

    /** Address the article was reposted from. */
    @ApiModelProperty(value = "转发地址")
    private String originalUrl;

    /** Number of times the article has been read. */
    @ApiModelProperty(value = "文章阅读量")
    private Integer quantity;

    /** Remark. */
    @ApiModelProperty(value = "说明")
    private String remark;

    /** SEO keywords. */
    @ApiModelProperty(value = "SEO关键词")
    private String keywords;

    /** Creation time; declared with FieldFill.INSERT. */
    @ApiModelProperty(value = "创建时间")
    @TableField(fill = FieldFill.INSERT)
    @JsonFormat(pattern = DateUtils.FORMAT_STRING,timezone="GMT+8")
    private Date createTime;

    /** Last update time; declared with FieldFill.UPDATE. */
    @ApiModelProperty(value = "最后更新时间")
    @TableField(fill = FieldFill.UPDATE)
    @JsonFormat(pattern = DateUtils.FORMAT_STRING,timezone="GMT+8")
    private Date updateTime;
}
标签实体:
package ity;import batisplus.annotation.*;import java.util.Date;import java.io.Serializable;import com.fasterxml.jackson.annotation.JsonFormat;
import com.shiyi.util.DateUtils;
import io.swagger.annotations.ApiModel;
import io.swagger.annotations.ApiModelProperty;
import lombok.*;

/**
 * Blog tag table.
 *
 * @author blue
 * @since 2021-09-09
 */
@Data
@EqualsAndHashCode(callSuper = false)
@TableName("b_tags")
@ApiModel(value="Tags对象", description="博客标签表")
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class Tags implements Serializable {

    private static final long serialVersionUID=1L;

    /** Primary key id (IdType.AUTO). */
    @ApiModelProperty(value = "主键id")
    @TableId(value = "id", type = IdType.AUTO)
    private Long id;

    /** Tag name. */
    @ApiModelProperty(value = "标签名称")
    private String name;

    /** Sort order. */
    @ApiModelProperty(value = "排序")
    private int sort;

    /** Click count. */
    @ApiModelProperty(value = "点击量")
    private int clickVolume;

    /** Creation time; declared with FieldFill.INSERT. */
    @ApiModelProperty(value = "创建时间")
    @TableField(fill = FieldFill.INSERT)
    @JsonFormat(pattern = DateUtils.FORMAT_STRING,timezone="GMT+8")
    private Date createTime;

    /** Last update time; declared with FieldFill.UPDATE. */
    @ApiModelProperty(value = "最后更新时间")
    @TableField(fill = FieldFill.UPDATE)
    @JsonFormat(pattern = DateUtils.FORMAT_STRING,timezone="GMT+8")
    private Date updateTime;

    // Declared with exist = false, i.e. marked as not backed by a table column.
    @TableField(exist = false)
    private int articleCount;

    /**
     * Creates a tag that carries only an id and a click count; all other fields keep
     * their defaults.
     *
     * @param id          tag id
     * @param clickVolume click count
     */
    public Tags(Long id, int clickVolume) {
        this.id = id;
        this.clickVolume = clickVolume;
    }
}
文章实体对应的 MySQL 8.0 建表脚本:
-- blog.b_article definition
-- Blog article table. The header comment sits on its own line so that it no longer
-- comments out the CREATE TABLE statement that follows it.
CREATE TABLE `b_article` (
  `id` bigint NOT NULL AUTO_INCREMENT COMMENT '主键id',
  `user_id` bigint DEFAULT NULL COMMENT '用户id',
  `category_id` bigint DEFAULT NULL COMMENT '分类id',
  `title` varchar(150) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL DEFAULT '' COMMENT '文章标题',
  `avatar` varchar(200) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '文章封面地址',
  `summary` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL DEFAULT '' COMMENT '文章简介',
  `content` mediumtext CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci COMMENT '文章内容 (最多两百字)',
  `content_md` mediumtext CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci COMMENT '文章内容md版',
  `is_secret` int DEFAULT '0' COMMENT '是否是私密文章 0 否 1是',
  `is_stick` int DEFAULT '0' COMMENT '是否置顶 0否 1是',
  `is_publish` int DEFAULT '0' COMMENT '是否发布 0:下架 1:发布',
  `is_original` int DEFAULT NULL COMMENT '是否原创 0:转载 1:原创',
  `original_url` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '转载地址',
  `quantity` bigint DEFAULT '0' COMMENT '文章阅读量',
  `remark` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT '' COMMENT '说明',
  `create_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
  `keywords` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT 'seo关键词',
  `update_time` datetime DEFAULT NULL COMMENT '修改时间',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=429 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci ROW_FORMAT=DYNAMIC COMMENT='博客文章表';
标签实体对应的 MySQL 8.0 建表脚本:
-- blog.b_tags definition
-- Blog tag table. The header comment sits on its own line so that it no longer
-- comments out the CREATE TABLE statement that follows it.
CREATE TABLE `b_tags` (
  `id` bigint NOT NULL AUTO_INCREMENT COMMENT '主键id',
  `name` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL DEFAULT '' COMMENT '标签名称',
  `click_volume` int DEFAULT '0',
  `sort` int NOT NULL COMMENT '排序',
  `create_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
  `update_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '最后更新时间',
  PRIMARY KEY (`id`) USING BTREE,
  KEY `tag_name` (`name`) USING BTREE COMMENT '博客标签名称'
) ENGINE=InnoDB AUTO_INCREMENT=289 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci ROW_FORMAT=DYNAMIC COMMENT='博客标签表';
文章与标签关联表:
-- blog.b_article_tag definition
-- Article-to-tag association table; each (article_id, tag_id) pair is unique.
-- The header comment sits on its own line so that it no longer comments out the
-- CREATE TABLE statement that follows it.
-- NOTE(review): article_id and tag_id are `int` here while the ids of b_article and b_tags are
-- `bigint`; the types are left as published — confirm whether they should be widened.
CREATE TABLE `b_article_tag` (
  `id` int NOT NULL AUTO_INCREMENT,
  `article_id` int NOT NULL COMMENT '文章id',
  `tag_id` int NOT NULL COMMENT '标签id',
  PRIMARY KEY (`id`) USING BTREE,
  UNIQUE KEY `fk_article_tag_1` (`article_id`,`tag_id`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=1308 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci ROW_FORMAT=DYNAMIC;
本文发布于:2024-01-30 21:07:24,感谢您对本站的认可!
本文链接:https://www.4u4v.net/it/170662004722849.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
留言与评论(共有 0 条评论) |