Jsoup爬取带登录验证码的网站

阅读：评论：0

Jsoup爬取带登录验证码的网站

　　今天学完爬虫之后想着爬一下我们学校的教务系统，可是发现登录的时候有验证码，因此研究了一下如何用Jsoup爬取带验证码的网站:

大体的思路是:(需要注意的是__VIEWSTATE一直变化，所以我们每个页面都需要重新获取并带着爬取下一个页面)

　　1.先爬取网站的主页，由于我们学校的网站是ASP，所以需要爬到每个网页的__VIEWSTATE。同时爬取主页也可以获得一个cookie(ASP.sessionId)

　　2.带着__VIEWSTATE和ASP.sessionId爬取验证码。(网上说有专门识别验证码的软件，在这里我只是把验证码下载到本地之后，需要用户输入验证码)获取验证码图片的时候需要带着cookie去获取，来标识是本次session请求的验证码，如果不带sessionid下载验证码之后输入验证码也无效。

　　3.输入用户名，密码和验证码登录系统，登录系统需要携带一些其他参数(值为空也需要携带)。

　　4.登录之后不能直接爬取成绩，需要先爬取登录成功之后的主页面，从中获取新的__VIEWSTATE。

　　5.爬完登录成功的主页之后就可以进行爬取成绩，将爬到的成绩收集起来，最后输出到html页面中。

(在这个爬虫的过程中需要注意__VIEWSTATE，每个页面都需要获取这个值，这个值是放在input隐藏域中。另外爬取过程中请求头需要携带Referer参数(也就是表示你是从哪个页面过来的)，防止被防盗链机制拦截)

下面是代码:

1.爬虫的入口

package aw.JsoupCrawJWXT;import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;
import java.util.Scanner;/*** 爬虫主的程序调度器(爬虫教务系统的入口)* * @author liqiang**/
public class MainClass {public static void main(String[] args) {// 输入学号和密码System.out.print("请输入你要查询学号:");Scanner sc = new Scanner(System.in);String xuehao = sc.next();System.out.print("请输入密码:");String password = sc.next();// Console con = sole();// String pswd = new adPassword());// 因为读取的是字符数组,所以需要用newtry {DownloadLoginfo downloadLoginfo = new DownloadLoginfo();LoginClass loginClass = new LoginClass();GradeOutput gradeOutput = new GradeOutput();// 1.访问主页，获取验证码与viewstate
            LogInfo();// 2.登录
            loginClass.login(downloadLoginfo, xuehao, password);for (Entry<String, String> entry : Cookies().entrySet()) {System.out.println("key:" + Key() + ";value" + Value());}CrawGrade crawGrade = new CrawGrade();//3. 爬取成绩的上一个页面
            Cookies(), ViewState(), xuehao);List<String> condition = geneQueryCondition();//4.循环分学年爬取成绩for (String xuenian : condition) {String html_content = awGrade(xuenian, "2", Cookies(),// 4.1爬取成绩页面
                        ViewState(), xuehao);llectGrade(html_content);}//5.输出爬到的数据到html文件中
            gradeOutput.outputDatas2Html();} catch (IOException e) {System.out.println("无法连接学校服务器");} catch (Exception e) {e.printStackTrace();}}/*** 构造需要查询的年份和学期* * @return*/public static List<String> geneQueryCondition() {List<String> condition = new ArrayList<String>();condition.add("2014-2015");condition.add("2015-2016");condition.add("2016-2017");condition.add("2017-2018");return condition;}}

2.爬取学校主页获取__VIEWSTATE和cookie

package aw.JsoupCrawJWXT;import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import des.Document;
import des.Element;/*** url获取图片并且保存到本地* * @author liqiang**/
public class DownloadLoginfo {/*** 第一次访问获取的cookie(查看发现就返回一个cookie:ASP.NET_SessionId)*/private  Map<String, String> cookies = null;/*** __viewstate    教务系统用于验证的信息*/private  String viewState = null;public DownloadLoginfo() {kies = new HashMap<String,String>();;this.viewState = "";}/*** 获取登录信息* 主要就是访问一下主页面，获取一个__viewstate与cookie*/public  void getLogInfo() throws Exception {String urlLogin = "/";Connection connect = t(urlLogin);// 伪造请求头connect.header("Accept", "application/json, text/javascript, */*; q=0.01").header("Accept-Encoding","gzip, deflate");connect.header("Accept-Language", "zh-CN,zh;q=0.9").header("Connection", "keep-alive");connect.header("Content-Length", "213").header("Content-Type","application/x-www-form-urlencoded; charset=UTF-8");connect.header("Host", &#ust.edu").header("Referer", "/");connect.header("User-Agent","Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36").header("X-Requested-With", "XMLHttpRequest");// 请求url获取响应信息Response res = connect.ignoreContentType(true).method(Method.POST).execute();// 执行请求// 获取返回的kies = kies();for (Entry<String, String> entry : Set()) {System.out.Key() + "-" + Value());}// 获取响应体String body = res.body();// 调用下面方法获取__ViewState(body);// 获取viewState//调用下载验证码的工具类下载验证码JsoupDoloadPicture.downloadImg(".aspx", cookies);;}/*** 获取viewstate* * @return*/public  String getViewState(String htmlContent) {Document document = Jsoup.parse(htmlContent);Element ele = document.select("input[name='__VIEWSTATE']").first();String value = ele.attr("value");// 获取到viewStatethis.viewState = value;return value;}public Map<String, String> getCookies() {return cookies;}public void setCookies(Map<String, String> cookies) {kies = cookies;}public String getViewState() {return viewState;}public void setViewState(String viewState) {this.viewState = viewState;}}

3.带着cookie爬取验证码，并下载到本地

package aw.JsoupCrawJWXT;import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Map;import org.apachemons.io.FileUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;/*** Jsoup带着cookie下载验证码到本地(必须带着cookie下载验证码，否则下载的验证码无效)* * @author liqiang**/
public class JsoupDoloadPicture {/*** 带着cookie下载验证码图片* * @param url* @param cookies* @throws IOException*/public static void downloadImg(String url, Map<String, String> cookies) throws IOException {// TODO Auto-generated method stubConnection connect = t(url);kies(cookies);// 携带cookies爬取图片connect.timeout(5 * 10000);Connection.Response response = connect.ignoreContentType(true).execute();byte[] img = response.bodyAsBytes();System.out.println(img.length);// 读取文件存储位置String directory = Value("path", "file");savaImage(img, directory, "yzm.png");}/*** 保存图片到本地* @param img* @param filePath* @param fileName*/public static void savaImage(byte[] img, String filePath, String fileName) {BufferedOutputStream bos = null;FileOutputStream fos = null;File file = null;File dir = new File(filePath);try {//判断文件目录是否存在ists() && !dir.isDirectory()){FileUtils.deleteQuietly(dir);}dir.mkdir();file = new File(filePath + "\" + fileName);fos = new FileOutputStream(file);bos = new BufferedOutputStream(fos);bos.write(img);System.out.println("验证码已经下载到:"+filePath);} catch (FileNotFoundException e) {// TODO Auto-generated catch block
            e.printStackTrace();} catch (IOException e) {// TODO Auto-generated catch block
            e.printStackTrace();} finally {if (bos != null) {try {bos.close();} catch (IOException e) {// TODO Auto-generated catch block
                    e.printStackTrace();}}if (fos != null) {try {fos.close();} catch (IOException e) {// TODO Auto-generated catch block
                    e.printStackTrace();}}}}
}

4.登录类

package aw.JsoupCrawJWXT;import java.util.Map;
import java.util.Map.Entry;
import java.util.Scanner;import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;/*** 登录类(访问登录页面获取登录的cookie)* * @author liqiang**/
/**
 * Submits the login form (user name, password, captcha and the current
 * {@code __VIEWSTATE}) and remembers the cookies of the response.
 */
public class LoginClass {

    /** Cookies returned by the login response. */
    private Map<String, String> cookies = null;

    /**
     * Prompts for the captcha on standard input and posts the login form.
     *
     * @param downloadLoginfo source of the session cookies and {@code __VIEWSTATE}
     * @param xuehao          student number (user name)
     * @param mima            password
     * @throws Exception if the request fails
     */
    public void login(DownloadLoginfo downloadLoginfo, String xuehao, String mima) throws Exception {
        // NOTE(review): the scheme and host of this URL and of the Referer were
        // lost in the published listing; Jsoup.connect needs an absolute URL.
        String urlLogin = ".aspx";
        Connection connect = Jsoup.connect(urlLogin);
        connect.timeout(5 * 100000);

        // Browser-like request headers. Host and Content-Length are left to the
        // HTTP client; the hard-coded Content-Length could not match a body
        // whose size depends on the user's input.
        connect.header("Content-Type", "application/x-www-form-urlencoded")
                .header("Referer", ".aspx?xh=" + xuehao + "&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613")
                .header("User-Agent",
                        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36");

        System.out.println("-----------请输入验证码---------");
        // Not closed on purpose: closing the Scanner would close System.in for
        // the rest of the process.
        Scanner sc = new Scanner(System.in);
        String yzm = sc.next();

        // Form fields; the empty ones are sent as well.
        connect.data("txtUserName", xuehao)
                .data("__VIEWSTATE", downloadLoginfo.getViewState())
                .data("TextBox2", mima)
                .data("Textbox1", "")
                .data("RadioButtonList1", "")
                .data("Button1", "")
                .data("lbLanguage", "")
                .data("hidPdrs", "")
                .data("hidsc", "")
                .data("txtSecretCode", yzm);
        // The session cookies tie the captcha to this login attempt.
        connect.cookies(downloadLoginfo.getCookies());

        Response res = connect.ignoreContentType(true).method(Method.POST).execute();

        this.cookies = res.cookies();
        for (Entry<String, String> entry : this.cookies.entrySet()) {
            System.out.println(entry.getKey() + "-" + entry.getValue());
        }

        System.out.println("---------获取的登录之后的页面-----------");
        String body = res.body();
        System.out.println(body);
    }

    /** @return the cookies of the last login response, or {@code null} before any login */
    public Map<String, String> getCookies() {
        return cookies;
    }

    /** @param cookies cookies to store */
    public void setCookies(Map<String, String> cookies) {
        this.cookies = cookies;
    }
}

5.爬取登录之后的主页和成绩

package aw.JsoupCrawJWXT;import java.io.IOException;
import java.util.Map;import org.jsoup.Connection;
import org.jsoup.Jsoup;
import des.Document;
import des.Element;/*** 爬取成绩的类* * @author liqiang**/
public class CrawGrade {private String viewState;/*** 全局获取viewstate的函数* @param html* @return*/public  String getViewState(String html){Document document = Jsoup.parse(html);Element ele = document.select("input[name='__VIEWSTATE']").first();String value = ele.attr("value");this.viewState = value;// 获取到viewStatereturn value;}/*** 爬取获取成绩的上一个页面(也就是刚登陆之后的页面)* @param cookies* @param viewStata* @param xuehao* @return* @throws IOException*/public String crawGradeLastPage(Map<String,String> cookies,String viewStata,String xuehao) throws IOException{String urlLogin = ".aspx?xh="+xuehao+"&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613";Connection connect = t(urlLogin);connect.timeout(5 * 100000);// 伪造请求头connect.header("Content-Length", "74642").header("Content-Type", "application/x-www-form-urlencoded");connect.header("Host", &#ust.edu").header("Referer", ".aspx?xh=201420020123&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613");connect.header("User-Agent","Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36");// 携带登陆信息connect.data("xh","201420020123").data("xm", viewStata).data("hidLanguage", "").data("gnmkdm", "N121613");//设置cookie
        kies(cookies);Document document = connect.post();System.out.println("-----------爬到的成绩的上一个页面--------------");String html = String();System.out.println(html);// 重新获取到ViewState(html);return html;}/*** 爬取成绩页面*/public String crawGrade(String xuenian,String xueqi,Map<String,String> cookies,String viewStata,String xuehao) throws IOException{String urlLogin = ".aspx?xh="+xuehao+"&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613";Connection connect = t(urlLogin);connect.timeout(5 * 100000);// 伪造请求头connect.header("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8").header("Accept-Encoding", "gzip, deflate");connect.header("Accept-Language", "zh-CN,zh;q=0.9").header("Connection", "keep-alive");connect.header("Content-Length", "74642").header("Content-Type", "application/x-www-form-urlencoded");connect.header("Host", &#ust.edu").header("Referer", ".aspx?xh=201420020123&xm=%C7%C7%C0%FB%C7%BF&gnmkdm=N121613");connect.header("User-Agent","Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36");// 携带登陆信息connect.data("__EVENTTARGET","").data("__EVENTARGUMENT", "").data("__VIEWSTATE", this.viewState).data("hidLanguage","").data("ddlXN", xuenian).data("ddlXQ", xueqi).data("btn_xn", "").data("ddl_kcxz", "");kies(cookies);Document document = connect.post();System.out.println("-----------爬到的成绩的页面--------------");String html = String();//更新ViewState(html);System.out.println(html);return html;}public void setViewState(String viewState) {this.viewState = viewState;}}

6.收集成绩的类

package aw.JsoupCrawJWXT;import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;import org.jsoup.Jsoup;
import des.Document;
import des.Element;
import org.jsoup.select.Elements;/*** 收集成绩与输出成绩* * @author liqiang**/
@SuppressWarnings("all")
public class GradeOutput {/*** 保存成绩的集合*/private List<Map<String, Object>> datas;public GradeOutput() {this.datas = new ArrayList<Map<String, Object>>();}/*** 收集成绩* * @param html* @return*/public String collectGrade(String html) {// 解析htmlDocument document = Jsoup.parse(html);// 获取成绩表格Element table = document.select("#Datagrid1").first();// 选择除表格表头之外的元素Elements trs = table.select("tr:gt(0)");for (Element ele : trs) {Map result = new LinkedHashMap();Elements ele0 = ele.select("td:eq(0)");// 找到学年result.put("xuenian", ());Elements ele1 = ele.select("td:eq(1)");// 找到学期result.put("xueqi", ());Elements ele3 = ele.select("td:eq(3)");// 找到课程名称result.put("kecheng", ());Elements ele8 = ele.select("td:eq(8)");// 找到成绩result.put("chengji", ());this.datas.add(result);}return null;}/*** 输出成绩到控制台*/public void outPutGrade() {if (this.datas == null || this.datas.size() == 0) {return;}System.out.println("-------下面是提取到的成绩--------");for (Map result : datas) {System.out.("xuenian") + "t" + ("xueqi") + "t" + ("kecheng") + "t"+ ("chengji") + "t");}}/*** 最后处理所有的数据，写出到html或者保存数据库* * @throws IOException*/public void outputDatas2Html() throws IOException {if (datas != null && datas.size() > 0) {// 读取文件存储位置String directory = Value("path", "file");File file = new File(directory+"\gradeOut.html");// 如果文件不存在就创建文件if (!ists()) {ateNewFile();}// 构造FileWriter用于向文件中输出信息(此构造方法可以接收file参数，也可以接收fileName参数)FileWriter fileWriter = new FileWriter(file);// 开始写入数据fileWriter.write("<html>");fileWriter.write("<head>");fileWriter.write("<title>xxx成绩单</title>");fileWriter.write("<style>table{width:100%;table-layout: fixed;word-break: break-all; word-wrap: break-word;}"+ "table td{border:1px solid black;width:300px}</style>");fileWriter.write("</head>");fileWriter.write("<body>");fileWriter.write("<table cellpadding='0' cellspacing='0' style='text-align:center;'>");fileWriter.write("<tr style='background-color:#95caca;font-size:20px'><td>学年</td><td>学期</td><td>课程名字</td><td>成绩</td></tr>");for (Map<String, Object> data 
: datas) {String xuenian = (String) ("xuenian");String xueqi = (String) ("xueqi");String kecheng = (String) ("kecheng");String chengji = (String) ("chengji");fileWriter.write("<tr>");fileWriter.write("<td>" + xuenian + "</td>");fileWriter.write("<td>" + xueqi + "</td>");fileWriter.write("<td>" + kecheng + "</td>");fileWriter.write("<td>" + chengji + "</td>");fileWriter.write("</tr>");}fileWriter.write("</table>");fileWriter.write("</body>");fileWriter.write("</html>");// 关闭文件流
            fileWriter.close();}}public List<Map<String, Object>> getDatas() {return datas;}public void setDatas(List<Map<String, Object>> datas) {this.datas = datas;}}

path.properties (设置验证码图片和最后的成绩单输出到哪个位置)

#fileToSave
#yzm
file=C:\\Users\\liqiang\\Desktop

读取上述配置文件的工具类:

package aw.JsoupCrawJWXT;import java.io.Serializable;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.MissingResourceException;
import java.util.ResourceBundle;
import java.util.Set;/*** 资源文件读取工具类* */
/**
 * Reads values from {@code .properties} resource bundles on the classpath,
 * always using the fixed locale {@code zh_CN}.
 */
public class ResourcesUtil implements Serializable {

    private static final long serialVersionUID = -7657898714983901418L;

    /** Language used for bundle lookup. */
    public static final String LANGUAGE = "zh";

    /** Country used for bundle lookup. */
    public static final String COUNTRY = "CN";

    private static Locale getLocale() {
        return new Locale(LANGUAGE, COUNTRY);
    }

    /**
     * Looks up one key in a bundle.
     *
     * @param baseName base name of the resource bundle
     * @param section  key to look up
     * @return the value, or an empty string when the bundle or key is missing,
     *         the value is not a string, or an argument is {@code null}
     */
    private static String getProperties(String baseName, String section) {
        if (baseName == null || section == null) {
            return "";
        }
        try {
            ResourceBundle rb = ResourceBundle.getBundle(baseName, getLocale());
            return rb.getString(section);
        } catch (MissingResourceException e) {
            // Best effort by contract: report the problem and fall back to "".
            System.err.println("Resource lookup failed for bundle '" + baseName + "', key '" + section + "': " + e);
            return "";
        } catch (ClassCastException e) {
            System.err.println("Resource '" + section + "' in bundle '" + baseName + "' is not a string: " + e);
            return "";
        }
    }

    /**
     * Reads a value from a resource bundle.
     *
     * @param fileName base name of the resource bundle
     * @param key      key to look up
     * @return the value, or an empty string when it cannot be read
     */
    public static String getValue(String fileName, String key) {
        return getProperties(fileName, key);
    }

    /**
     * Lists all keys of a resource bundle.
     *
     * @param baseName base name of the resource bundle
     * @return a new list with every key of the bundle
     * @throws MissingResourceException if the bundle cannot be found
     */
    public static List<String> gekeyList(String baseName) {
        ResourceBundle rb = ResourceBundle.getBundle(baseName, getLocale());
        return new ArrayList<String>(rb.keySet());
    }

    /**
     * Reads a value from a resource bundle and formats it with
     * {@link MessageFormat}.
     *
     * @param fileName base name of the resource bundle
     * @param key      key to look up
     * @param objs     arguments for the placeholders in the value
     * @return the formatted value; an empty string when the value cannot be read
     */
    public static String getValue(String fileName, String key, Object[] objs) {
        String pattern = getValue(fileName, key);
        return MessageFormat.format(pattern, objs);
    }

    /**
     * Small manual check: formats key {@code 101} of the {@code messages}
     * bundle with two arguments and prints the result.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println(getValue("messages", "101", new Object[] {100, 200}));
    }
}

git地址:

转载于:.html

本文发布于:2024-01-29 10:56:52，感谢您对本站的认可！

本文链接：https://www.4u4v.net/it/170649701614792.html

上一篇：java建造者模式简化

下一篇：Java数据结构和集合源码