今天学完爬虫之后想着爬一下我们学校的教务系统,可是发现登录的时候有验证码,因此研究了一下如何用Jsoup爬取带验证码的网站:
大体的思路是:(需要注意的是__VIEWSTATE一直变化,所以我们每个页面都需要重新获取并带着爬取下一个页面)
1.先爬取网站的主页,由于我们学校的网站是ASP,所以需要爬到每个网页的__VIEWSTATE。同时爬取主页也可以获得一个cookie(ASP.sessionId)
2.带着__VIEWSTATE和ASP.sessionId爬取验证码。(网上说有专门识别验证码的软件,在这里我只是把验证码下载到本地之后,需要用户输入验证码)获取验证码图片的时候需要带着cookie去获取,来标识是本次session请求的验证码,如果不带sessionid下载验证码之后输入验证码也无效。
3.输入用户名,密码和验证码登录系统,登录系统需要携带一些其他参数(值为空也需要携带)。
4.登录之后不能直接爬取成绩,需要先爬取登录成功之后的主页面,以获取新的__VIEWSTATE。
5.爬完登录成功的主页之后就可以进行爬取成绩,将爬到的成绩收集起来,最后输出到html页面中。
(在这个爬虫的过程中需要注意__VIEWSTATE,每个页面都需要重新获取这个值,这个值放在input隐藏域中。另外爬取过程中请求头需要携带Referer参数(也就是表示你是从哪个页面跳转过来的),因为服务器会用它来防盗链)
下面是代码:
1.爬虫的入口
package aw.JsoupCrawJWXT;import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map.Entry; import java.util.Scanner;/*** 爬虫主的程序调度器(爬虫教务系统的入口)* * @author liqiang**/ public class MainClass {public static void main(String[] args) {// 输入学号和密码System.out.print("请输入你要查询学号:");Scanner sc = new Scanner(System.in);String xuehao = sc.next();System.out.print("请输入密码:");String password = sc.next();// Console con = sole();// String pswd = new adPassword());// 因为读取的是字符数组,所以需要用newtry {DownloadLoginfo downloadLoginfo = new DownloadLoginfo();LoginClass loginClass = new LoginClass();GradeOutput gradeOutput = new GradeOutput();// 1.访问主页,获取验证码与viewstate LogInfo();// 2.登录 loginClass.login(downloadLoginfo, xuehao, password);for (Entry<String, String> entry : Cookies().entrySet()) {System.out.println("key:" + Key() + ";value" + Value());}CrawGrade crawGrade = new CrawGrade();//3. 爬取成绩的上一个页面 Cookies(), ViewState(), xuehao);List<String> condition = geneQueryCondition();//4.循环分学年爬取成绩for (String xuenian : condition) {String html_content = awGrade(xuenian, "2", Cookies(),// 4.1爬取成绩页面 ViewState(), xuehao);llectGrade(html_content);}//5.输出爬到的数据到html文件中 gradeOutput.outputDatas2Html();} catch (IOException e) {System.out.println("无法连接学校服务器");} catch (Exception e) {e.printStackTrace();}}/*** 构造需要查询的年份和学期* * @return*/public static List<String> geneQueryCondition() {List<String> condition = new ArrayList<String>();condition.add("2014-2015");condition.add("2015-2016");condition.add("2016-2017");condition.add("2017-2018");return condition;}}
2.爬取学校主页获取__VIEWSTATE和cookie
package aw.JsoupCrawJWXT;import java.util.HashMap; import java.util.Map; import java.util.Map.Entry;import org.jsoup.Connection; import org.jsoup.Connection.Method; import org.jsoup.Connection.Response; import org.jsoup.Jsoup; import des.Document; import des.Element;/*** url获取图片并且保存到本地* * @author liqiang**/ public class DownloadLoginfo {/*** 第一次访问获取的cookie(查看发现就返回一个cookie:ASP.NET_SessionId)*/private Map<String, String> cookies = null;/*** __viewstate 教务系统用于验证的信息*/private String viewState = null;public DownloadLoginfo() {kies = new HashMap<String,String>();;this.viewState = "";}/*** 获取登录信息* 主要就是访问一下主页面,获取一个__viewstate与cookie*/public void getLogInfo() throws Exception {String urlLogin = "/";Connection connect = t(urlLogin);// 伪造请求头connect.header("Accept", "application/json, text/javascript, */*; q=0.01").header("Accept-Encoding","gzip, deflate");connect.header("Accept-Language", "zh-CN,zh;q=0.9").header("Connection", "keep-alive");connect.header("Content-Length", "213").header("Content-Type","application/x-www-form-urlencoded; charset=UTF-8");connect.header("Host", ust.edu").header("Referer", "/");connect.header("User-Agent","Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36").header("X-Requested-With", "XMLHttpRequest");// 请求url获取响应信息Response res = connect.ignoreContentType(true).method(Method.POST).execute();// 执行请求// 获取返回的kies = kies();for (Entry<String, String> entry : Set()) {System.out.Key() + "-" + Value());}// 获取响应体String body = res.body();// 调用下面方法获取__ViewState(body);// 获取viewState//调用下载验证码的工具类下载验证码JsoupDoloadPicture.downloadImg(".aspx", cookies);;}/*** 获取viewstate* * @return*/public String getViewState(String htmlContent) {Document document = Jsoup.parse(htmlContent);Element ele = document.select("input[name='__VIEWSTATE']").first();String value = ele.attr("value");// 获取到viewStatethis.viewState = value;return value;}public Map<String, String> getCookies() {return cookies;}public void 
setCookies(Map<String, String> cookies) {kies = cookies;}public String getViewState() {return viewState;}public void setViewState(String viewState) {this.viewState = viewState;}}
3.带着cookie爬取验证码,并下载到本地
package aw.JsoupCrawJWXT;import java.io.BufferedOutputStream; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.util.Map;import org.apachemons.io.FileUtils; import org.jsoup.Connection; import org.jsoup.Jsoup;/*** Jsoup带着cookie下载验证码到本地(必须带着cookie下载验证码,否则下载的验证码无效)* * @author liqiang**/ public class JsoupDoloadPicture {/*** 带着cookie下载验证码图片* * @param url* @param cookies* @throws IOException*/public static void downloadImg(String url, Map<String, String> cookies) throws IOException {// TODO Auto-generated method stubConnection connect = t(url);kies(cookies);// 携带cookies爬取图片connect.timeout(5 * 10000);Connection.Response response = connect.ignoreContentType(true).execute();byte[] img = response.bodyAsBytes();System.out.println(img.length);// 读取文件存储位置String directory = Value("path", "file");savaImage(img, directory, "yzm.png");}/*** 保存图片到本地* @param img* @param filePath* @param fileName*/public static void savaImage(byte[] img, String filePath, String fileName) {BufferedOutputStream bos = null;FileOutputStream fos = null;File file = null;File dir = new File(filePath);try {//判断文件目录是否存在ists() && !dir.isDirectory()){FileUtils.deleteQuietly(dir);}dir.mkdir();file = new File(filePath + "\" + fileName);fos = new FileOutputStream(file);bos = new BufferedOutputStream(fos);bos.write(img);System.out.println("验证码已经下载到:"+filePath);} catch (FileNotFoundException e) {// TODO Auto-generated catch block e.printStackTrace();} catch (IOException e) {// TODO Auto-generated catch block e.printStackTrace();} finally {if (bos != null) {try {bos.close();} catch (IOException e) {// TODO Auto-generated catch block e.printStackTrace();}}if (fos != null) {try {fos.close();} catch (IOException e) {// TODO Auto-generated catch block e.printStackTrace();}}}} }
4.登录类
package aw.JsoupCrawJWXT;import java.util.Map; import java.util.Map.Entry; import java.util.Scanner;import org.jsoup.Connection; import org.jsoup.Connection.Method; import org.jsoup.Connection.Response; import org.jsoup.Jsoup;/*** 登录类(访问登录页面获取登录的cookie)* * @author liqiang**/ public class LoginClass {/*** 记录返回的cookie*/private Map<String, String> cookies = null;/*** 模拟登录获取cookie和sessionid* */public void login(DownloadLoginfo downloadLoginfo, String xuehao, String mima) throws Exception {String urlLogin = ".aspx";Connection connect = t(urlLogin);connect.timeout(5 * 100000);// 伪造请求头connect.header("Content-Length", "213").header("Content-Type", "application/x-www-form-urlencoded");connect.header("Host", ust.edu").header("Referer",".aspx?xh=" + xuehao + "&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613");connect.header("User-Agent","Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36");// 输入验证码System.out.println("-----------请输入验证码---------");Scanner sc = new Scanner(System.in);String yzm = sc.next();sc.close();// 携带登陆信息connect.data("txtUserName", xuehao).data("__VIEWSTATE", ViewState()).data("TextBox2", mima).data("Textbox1", "").data("RadioButtonList1", "").data("Button1", "").data("lbLanguage", "").data("hidPdrs", "").data("hidsc", "").data("txtSecretCode", yzm);Cookies());// 请求url获取响应信息Response res = connect.ignoreContentType(true).method(Method.POST).execute();// 执行请求// 获取返回的kies = kies();for (Entry<String, String> entry : Set()) {System.out.Key() + "-" + Value());}System.out.println("---------获取的登录之后的页面-----------");String body = res.body();// 获取响应体 System.out.println(body);}public Map<String, String> getCookies() {return cookies;}public void setCookies(Map<String, String> cookies) {kies = cookies;}}
5.爬取登录之后的主页和成绩
package aw.JsoupCrawJWXT;import java.io.IOException; import java.util.Map;import org.jsoup.Connection; import org.jsoup.Jsoup; import des.Document; import des.Element;/*** 爬取成绩的类* * @author liqiang**/ public class CrawGrade {private String viewState;/*** 全局获取viewstate的函数* @param html* @return*/public String getViewState(String html){Document document = Jsoup.parse(html);Element ele = document.select("input[name='__VIEWSTATE']").first();String value = ele.attr("value");this.viewState = value;// 获取到viewStatereturn value;}/*** 爬取获取成绩的上一个页面(也就是刚登陆之后的页面)* @param cookies* @param viewStata* @param xuehao* @return* @throws IOException*/public String crawGradeLastPage(Map<String,String> cookies,String viewStata,String xuehao) throws IOException{String urlLogin = ".aspx?xh="+xuehao+"&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613";Connection connect = t(urlLogin);connect.timeout(5 * 100000);// 伪造请求头connect.header("Content-Length", "74642").header("Content-Type", "application/x-www-form-urlencoded");connect.header("Host", ust.edu").header("Referer", ".aspx?xh=201420020123&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613");connect.header("User-Agent","Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36");// 携带登陆信息connect.data("xh","201420020123").data("xm", viewStata).data("hidLanguage", "").data("gnmkdm", "N121613");//设置cookie kies(cookies);Document document = connect.post();System.out.println("-----------爬到的成绩的上一个页面--------------");String html = String();System.out.println(html);// 重新获取到ViewState(html);return html;}/*** 爬取成绩页面*/public String crawGrade(String xuenian,String xueqi,Map<String,String> cookies,String viewStata,String xuehao) throws IOException{String urlLogin = ".aspx?xh="+xuehao+"&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613";Connection connect = t(urlLogin);connect.timeout(5 * 100000);// 伪造请求头connect.header("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8").header("Accept-Encoding", 
"gzip, deflate");connect.header("Accept-Language", "zh-CN,zh;q=0.9").header("Connection", "keep-alive");connect.header("Content-Length", "74642").header("Content-Type", "application/x-www-form-urlencoded");connect.header("Host", ust.edu").header("Referer", ".aspx?xh=201420020123&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613");connect.header("User-Agent","Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36");// 携带登陆信息connect.data("__EVENTTARGET","").data("__EVENTARGUMENT", "").data("__VIEWSTATE", this.viewState).data("hidLanguage","").data("ddlXN", xuenian).data("ddlXQ", xueqi).data("btn_xn", "").data("ddl_kcxz", "");kies(cookies);Document document = connect.post();System.out.println("-----------爬到的成绩的页面--------------");String html = String();//更新ViewState(html);System.out.println(html);return html;}public void setViewState(String viewState) {this.viewState = viewState;}}
6.收集成绩的类
package aw.JsoupCrawJWXT;import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; import java.util.Map;import org.jsoup.Jsoup; import des.Document; import des.Element; import org.jsoup.select.Elements;/*** 收集成绩与输出成绩* * @author liqiang**/ @SuppressWarnings("all") public class GradeOutput {/*** 保存成绩的集合*/private List<Map<String, Object>> datas;public GradeOutput() {this.datas = new ArrayList<Map<String, Object>>();}/*** 收集成绩* * @param html* @return*/public String collectGrade(String html) {// 解析htmlDocument document = Jsoup.parse(html);// 获取成绩表格Element table = document.select("#Datagrid1").first();// 选择除表格表头之外的元素Elements trs = table.select("tr:gt(0)");for (Element ele : trs) {Map result = new LinkedHashMap();Elements ele0 = ele.select("td:eq(0)");// 找到学年result.put("xuenian", ());Elements ele1 = ele.select("td:eq(1)");// 找到学期result.put("xueqi", ());Elements ele3 = ele.select("td:eq(3)");// 找到课程名称result.put("kecheng", ());Elements ele8 = ele.select("td:eq(8)");// 找到成绩result.put("chengji", ());this.datas.add(result);}return null;}/*** 输出成绩到控制台*/public void outPutGrade() {if (this.datas == null || this.datas.size() == 0) {return;}System.out.println("-------下面是提取到的成绩--------");for (Map result : datas) {System.out.("xuenian") + "t" + ("xueqi") + "t" + ("kecheng") + "t"+ ("chengji") + "t");}}/*** 最后处理所有的数据,写出到html或者保存数据库* * @throws IOException*/public void outputDatas2Html() throws IOException {if (datas != null && datas.size() > 0) {// 读取文件存储位置String directory = Value("path", "file");File file = new File(directory+"\gradeOut.html");// 如果文件不存在就创建文件if (!ists()) {ateNewFile();}// 构造FileWriter用于向文件中输出信息(此构造方法可以接收file参数,也可以接收fileName参数)FileWriter fileWriter = new FileWriter(file);// 开始写入数据fileWriter.write("<html>");fileWriter.write("<head>");fileWriter.write("<title>xxx成绩单</title>");fileWriter.write("<style>table{width:100%;table-layout: fixed;word-break: break-all; word-wrap: 
break-word;}"+ "table td{border:1px solid black;width:300px}</style>");fileWriter.write("</head>");fileWriter.write("<body>");fileWriter.write("<table cellpadding='0' cellspacing='0' style='text-align:center;'>");fileWriter.write("<tr style='background-color:#95caca;font-size:20px'><td>学年</td><td>学期</td><td>课程名字</td><td>成绩</td></tr>");for (Map<String, Object> data : datas) {String xuenian = (String) ("xuenian");String xueqi = (String) ("xueqi");String kecheng = (String) ("kecheng");String chengji = (String) ("chengji");fileWriter.write("<tr>");fileWriter.write("<td>" + xuenian + "</td>");fileWriter.write("<td>" + xueqi + "</td>");fileWriter.write("<td>" + kecheng + "</td>");fileWriter.write("<td>" + chengji + "</td>");fileWriter.write("</tr>");}fileWriter.write("</table>");fileWriter.write("</body>");fileWriter.write("</html>");// 关闭文件流 fileWriter.close();}}public List<Map<String, Object>> getDatas() {return datas;}public void setDatas(List<Map<String, Object>> datas) {this.datas = datas;}}
path.properties (设置验证码图片和最后的成绩单输出到哪个位置)
#fileToSave
#yzm
file=C:\\Users\\liqiang\\Desktop
读取上述配置文件的工具类:
package aw.JsoupCrawJWXT;import java.io.Serializable; MessageFormat; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Locale; import java.util.ResourceBundle; import java.util.Set;/*** 资源文件读取工具类* */ public class ResourcesUtil implements Serializable {private static final long serialVersionUID = -7657898714983901418L;/*** 系统语言环境,默认为中文zh*/public static final String LANGUAGE = "zh";/*** 系统国家环境,默认为中国CN*/public static final String COUNTRY = "CN";private static Locale getLocale() {Locale locale = new Locale(LANGUAGE, COUNTRY);return locale;}/*** 根据语言、国家、资源文件名和key名字获取资源文件值* * @param language* 语言* * @param country* 国家* * @param baseName* 资源文件名* * @param section* key名字* * @return 值*/private static String getProperties(String baseName, String section) {String retValue = "";try {Locale locale = getLocale();ResourceBundle rb = Bundle(baseName, locale);retValue = (String) rb.getObject(section);} catch (Exception e) {e.printStackTrace();// TODO 添加处理 }return retValue;}/*** 通过key从资源文件读取内容* * @param fileName* 资源文件名* * @param key* 索引* * @return 索引对应的内容*/public static String getValue(String fileName, String key) {String value = getProperties(fileName,key);return value;}public static List<String> gekeyList(String baseName) {Locale locale = getLocale();ResourceBundle rb = Bundle(baseName, locale);List<String> reslist = new ArrayList<String>();Set<String> keyset = rb.keySet();for (Iterator<String> it = keyset.iterator(); it.hasNext();) {String lkey = (();reslist.add(lkey);}return reslist;}/*** 通过key从资源文件读取内容,并格式化* * @param fileName* 资源文件名* * @param key* 索引* * @param objs* 格式化参数* * @return 格式化后的内容*/public static String getValue(String fileName, String key, Object[] objs) {String pattern = getValue(fileName, key);String value = MessageFormat.format(pattern, objs);return value;}public static void main(String[] args) {System.out.println(getValue(ssages", "101",new Object[]{100,200}));//根据操作系统环境获取语言环境/*Locale locale = 
Default();System.out.Country());//输出国家代码System.out.Language());//输出语言代码s//加载国际化资源(classpath下resources目录下的messages.properties,如果是中文环境会优先找messages_zh_CN.properties)ResourceBundle rb = Bundle(ssages", locale);String retValue = rb.getString("101");//101是messages.properties文件中的keySystem.out.println(retValue);//信息格式化,如果资源中有{}的参数则需要使用MessageFormat格式化,Object[]为传递的参数,数量根据资源文件中的{}个数决定String value = MessageFormat.format(retValue, new Object[]{100,200});System.out.println(value); */} }
git地址:
转载于:.html
本文发布于:2024-01-29 10:56:52,感谢您对本站的认可!
本文链接:https://www.4u4v.net/it/170649701614792.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
留言与评论(共有 0 条评论) |