本文使用Jsoup采集拉勾网招聘数据并写入CSV文件中,实现非常简单,在此不做多余的解释,如有问题可留言交流。
数据模型:job.java
package xyz.baal.jsoup;public class Job {private String jobname;//职位名称private String salary;//薪水private String place;//工作地点private String experience;//工作经验private String educational;//学历private String business;//业务private String stage;//发展阶段private String company;//公司名称public Job() {super();}public Job(String jobname, String salary, String place, String experience, String educational, String business,String stage, String company) {super();this.jobname = jobname;this.salary = salary;this.place = perience = experience;this.educational = educational;this.business = business;this.stage = stage;thispany = company;}public String getJobname() {return jobname;}public void setJobname(String jobname) {this.jobname = jobname;}public String getSalary() {return salary;}public void setSalary(String salary) {this.salary = salary;}public String getPlace() {return place;}public void setPlace(String place) {this.place = place;}public String getExperience() {return experience;}public void setExperience(String experience) {perience = experience;}public String getEducational() {return educational;}public void setEducational(String educational) {this.educational = educational;}public String getBusiness() {return business;}public void setBusiness(String business) {this.business = business;}public String getStage() {return stage;}public void setStage(String stage) {this.stage = stage;}public String getCompany() {return company;}public void setCompany(String company) {thispany = company;}@Overridepublic String toString() {return "Job [jobname=" + jobname + ", salary=" + salary + ", place=" + place + ", experience=" + experience+ ", educational=" + educational + ", business=" + business + ", stage=" + stage + ", company="+ company + "]";}
}
写入CSV使用的是javacsv库,由于源文件中就CsvReader、CsvWriter两个类文件,在这里直接引用了CsvWriter源文件(附:API文档)。
获取各个招聘职位首页链接,如java招聘链接为//www.lagou.com/zhaopin/Java/
package xyz.baal.jsoup;import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;import org.jsoup.Jsoup;
import des.Document;
import des.Element;
import org.jsoup.select.Elements;/*** 根据拉勾网首页获取各个招聘职位首页链接* * @author **/
public class GetZPURL {private List<String> zpURLlist = new ArrayList<String>();//存放各个招聘职位首页链接public GetZPURL(){super();}/*** 网络加载html文档* @param url 文档url*/public void loadInternet(String url) {Document doc = null;try {doc = t(url).userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36").timeout(5000).get();} catch (IOException e) {System.out.println("获取招聘URL失败。");return;}Element content = ElementById("container");Elements links = ElementsByTag("a");for (Element link : links) {String linkHref = link.attr("href");if(isZp(linkHref)){zpURLlist.add(linkHref);}}}/*** 从本地加载html文档* @param path 文档路径* @param charset 文档字符集* @param baseURL 基本url,当链接中存在相对路径时作为前缀* @throws IOException 文件不存在或无法读取时抛出此异常*/public void loadLocal(String path ,String charset, String baseURL) throws IOException {File input = new File(path);Document doc = Jsoup.parse(input, charset, baseURL);Element content = ElementById("container");Elements links = ElementsByTag("a");for (Element link : links) {String linkHref = link.attr("href");if(isZp(linkHref)){zpURLlist.add(linkHref);}}}public boolean isZp(String url){if(url.indexOf("//www.lagou/zhaopin/")!=-1&&url.length()>24){return true;}else {return false;}}public List<String> getZpURLlist() {return zpURLlist;}
}
获取某一招聘职位的30x15条数据,并写入CSV文件。
package xyz.baal.jsoup;import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import Matcher;
import Pattern;import org.jsoup.Jsoup;
import des.Document;
import des.Element;
import org.jsoup.select.Elements;import com.csvreader.CsvWriter;/*** 获取拉勾网某一个职位的30x15条招聘信息* * @author* */
public class GetJob implements Runnable{private String zpUrl;// 某招聘职位对应的原始URLprivate List<String> zpUrlList = new ArrayList<String>(); // 每个分页对应的URLprivate List<String> jobUrlList = new ArrayList<String>();// 每条招聘信息对应的URLprivate List<Job> joblist = new ArrayList<Job>();// 存放30x15条招聘信息private static final String A_HREF = "//www.lagou/jobs/\d+.html"; // href格式 //www.lagou/jobs/2350451.htmlprivate static final String PATH = "D:/"; // 文件存放路径private String jobName = "";//招聘职位名称/*** * @param url 招聘职位首页url,如java、hadoop等招聘职位*/public GetJob(String url) {zpUrl = url;}/*** 在此方法内完成某一招聘职位的450条数据抓取*/public void init() {// 构建30个分页URLzpUrlList.add(zpUrl + "?filterOption=3");for (int i = 2; i <= 30; i++) {zpUrlList.add(zpUrl + i + "/?filterOption=3");}// 提取每个分页中的招聘信息URLfor (String string : zpUrlList) {Document doc = null;try {doc = t("http:" + string).userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36").timeout(5000).get();} catch (IOException e) {continue;}Element content = ElementById("s_position_list");if (content == null) {continue;}Elements links = ElementsByTag("a");if (links == null) {continue;}for (Element link : links) {String linkHref = link.attr("href");Pattern pattern = Patternpile(A_HREF, Pattern.CASE_INSENSITIVE);Matcher matcher = pattern.matcher(linkHref);if (matcher.find()) {jobUrlList.add("http:" + linkHref);}}if (jobName == "") {jobName = doc.select("title").first().text().split("-")[0];}}// 根据招聘信息URL提取招聘详细信息for (String string : jobUrlList) {Job job = new Job();Document doc = null;try {doc = t(string).userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.101 Safari/537.36").timeout(5000).get();job.setJobname(jobName);Element content = ElementById("container");Element job_request = content.select(".job_request p").first();if (job_request != null) {if (job_request.child(0) != null) 
{job.setSalary(job_request.child(0).text());job.setPlace(job_request.child(1).text());job.setExperience(job_request.child(2).text());job.setEducational(job_request.child(3).text());} else {continue;}} else {continue;}Element cpy = ElementById("job_company");if (cpy.childNodeSize()>=2) {job.setCompany(cpy.child(0).child(0).child(0).attr("alt"));job.setBusiness(cpy.child(1).child(0).child(0).ownText());job.setStage(cpy.child(1).child(2).child(0).ownText());} else {continue;}joblist.add(job);} catch (IOException e) {continue;}}}public List<Job> getJoblist() {return joblist;}/*** 将采集数据写入txt文件中*/public void writeTxtFile() {if (joblist.size() == 0 || joblist == null) {return;}File file = new File(PATH + (0).getJobname() + ".txt");FileWriter fw = null;BufferedWriter bw = null;Iterator<Job> iter = joblist.iterator();try {fw = new FileWriter(file);bw = new BufferedWriter(fw);while (iter.hasNext()) {bw.().toString());bw.newLine();}bw.flush();} catch (Exception e) {e.printStackTrace();} finally {try {if (bw != null) {bw.close();}if (fw != null) {fw.close();}} catch (Exception e) {e.printStackTrace();}}}/*** 将采集数据写入CSV文件中*/public void writeCSVFile() {CsvWriter wr = null;if (joblist.size() == 0 || joblist == null) {return;}try {String csvFilePath = PATH + (0).getJobname() + ".csv";wr = new CsvWriter(csvFilePath, ',', Charset.forName("GBK"));String[] header = { "职位名称", "薪水", "工作地点", "工作经验", "学历", "公司名称", "公司业务", "发展阶段"};wr.writeRecord(header);for (Job job : joblist) {String[] jobstr = { Jobname(), Salary(), Place(), Experience(),Educational(), Company(), Business(), Stage() };wr.writeRecord(jobstr);}} catch (IOException e) {e.printStackTrace();} finally {if (wr != null) {wr.close();}}}@Overridepublic void run() {init();writeCSVFile();System.out.println(jobName+"--End");}
}
采集测试:
package xyz.baal.jsoup;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

/**
 * Scrape driver: runs one GetJob task per category on a small thread pool.
 */
public class Test {

    public static List<String> zpURLlist = new ArrayList<String>();

    public static void main(String[] args) throws IOException {
        // Bounded work queue for tasks waiting on a free thread.
        BlockingQueue<Runnable> bqueue = new ArrayBlockingQueue<Runnable>(20);
        // Core pool of 3 threads, at most 4; idle extras die after 50 ms.
        ThreadPoolExecutor pool = new ThreadPoolExecutor(3, 4, 50, TimeUnit.MILLISECONDS, bqueue);

        Runnable job1 = new GetJob("//www.lagou.com/zhaopin/iOS/");
        Runnable job2 = new GetJob("//www.lagou.com/zhaopin/C/");
        Runnable job3 = new GetJob("//www.lagou.com/zhaopin/C++/");
        Runnable job4 = new GetJob("//www.lagou.com/zhaopin/Python/");
        Runnable job5 = new GetJob("//www.lagou.com/zhaopin/HTML5/");
        Runnable job6 = new GetJob("//www.lagou.com/zhaopin/webqianduan/");

        // Fixed: the published snippet lost "pool.exec" from each call.
        pool.execute(job1);
        pool.execute(job2);
        pool.execute(job3);
        pool.execute(job4);
        pool.execute(job5);
        pool.execute(job6);

        // Stop accepting new tasks; queued tasks still run to completion.
        pool.shutdown();
    }
}
如需IP代理可在此网站寻找代理资源:/
GitHub:点这里
本文发布于:2024-01-29 02:34:00,感谢您对本站的认可!
本文链接:https://www.4u4v.net/it/170646684512074.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
留言与评论(共有 0 条评论) |