起因

主要是因为最近在搞爬虫;第二个原因是最近需要用 PPT,又懒得一套一套地下,所以准备爬一些免费的 PPT 下来。

准备工作

爬虫的简单使用已经在上一篇博客里讲到了,这次就想来点偏实战的,本来高大佬已经写好了站长之家的爬取方式,直接用java实现就可以了,但是!我发现站长之家不免费了。。。emmm 很难受

所以只能换一个了 于是我找到了优品网 貌似还可以 而且是免费的

那么 按照高大佬的方式 应该看下这些缩略图和下载链接有什么关系没 看了一下 没什么关系。。。靠

举个例子:

缩略图:/uploads/allimg/181104/1-1Q104144T30-L.jpg

下载地址:http://www.youpinppt.com/soft/181104/1-1Q104144Q6.rar

居然没什么关系,我直接懵逼了,这怎么办呢?讲道理,如果我是开发人员,我应该需要个东西来记录这个 PPT,比如给个编号什么的。经过我的观察,我发现请求下载地址的时候带了一个 aid 参数,而这个 aid 就在缩略图外层的 A 标签上,所以只要能搞到这个 aid 就能下载啦。而且我发现这个站点没有做拦截,所以直接爬下来就可以了。

但是要注意的是,5231 这样的 id 号位数并不固定,也有可能是两位数或者三位数,所以我在后面截取字符串的时候做了处理。还有需要注意的是下载链接:有的链接给出的地址是 uploads 开头的,但实际上这是错误地址,正确的下载地址是 soft 开头的(估计是数据迁移到了另外一个站点)。

代码

关于爬虫的基础理论我在上一篇博客已经讲过了 这里就直接贴代码了

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
package com.sammie.top.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.net.URISyntaxException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;

/**
 * Crawls the template list pages of ypppt.com and stores, for every list entry, its title,
 * its article id ("aid") and the download URL found on the entry's download page into the
 * {@code ppt} table.
 *
 * <p>Failures are handled on a best-effort basis: a list entry that cannot be parsed or fetched
 * is skipped (with a stack trace on stderr) and the crawl continues with the next entry.
 */
public class zzCrawler {
    // Not used anywhere inside this class; kept so that any other code referring to it
    // still compiles.
    static String s = "";

    /**
     * Crawls the list pages {@code list-2.html} through {@code list-99.html}.
     *
     * <p>NOTE(review): page 1 is never requested. Presumably the first list page has a
     * different URL; confirm against the site.
     */
    public static void main(String[] args) throws Exception {
        String baseUrl = "http://www.ypppt.com/moban/list-";
        // A single ppt instance is reused as a scratch record; insert() reads it immediately
        // after it has been filled, so no state leaks between entries.
        ppt p = new ppt();
        for (int i = 2; i < 100; i++) {
            doCrawler(baseUrl + i + ".html", p);
        }
    }

    /**
     * Opens a new JDBC connection to the local {@code test} database.
     *
     * @return the connection, or {@code null} when the driver class is missing or the
     *         connection attempt fails (the cause is printed to stderr)
     */
    private static Connection getConn() {
        String driver = "com.mysql.cj.jdbc.Driver";
        String url = "jdbc:mysql://localhost:3306/test?characterEncoding=utf8&useSSL=false&serverTimezone=UTC";
        String username = "root";
        // NOTE(review): credentials are hard-coded; consider moving them to configuration.
        String password = "123456";
        try {
            Class.forName(driver); // load the JDBC driver class
            return DriverManager.getConnection(url, username, password);
        } catch (ClassNotFoundException | SQLException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Fetches one list page and stores every entry found under {@code .posts li}.
     *
     * @param CrawlerUrl absolute URL of the list page
     * @param p          scratch record that is filled and inserted once per entry
     * @throws URISyntaxException if {@code CrawlerUrl} is not a valid URI
     */
    public static void doCrawler(String CrawlerUrl, ppt p) throws URISyntaxException {
        // Extra query parameters could be added via URIBuilder.setParameter(key, val).
        HttpGet listRequest = new HttpGet(new URIBuilder(CrawlerUrl).build());

        // One client serves the list page and all of its detail requests, and is always closed.
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            String listHtml = fetchHtml(httpClient, listRequest);
            if (listHtml == null) {
                return;
            }
            Document doc = Jsoup.parse(listHtml);
            for (Element item : doc.select(".posts li")) {
                try {
                    crawlItem(httpClient, item, p);
                } catch (IOException e) {
                    // One broken entry must not abort the rest of the page.
                    e.printStackTrace();
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Executes {@code request} and returns the response body decoded as UTF-8.
     *
     * @return the body, or {@code null} when the status code is not 200
     */
    private static String fetchHtml(CloseableHttpClient client, HttpGet request) throws IOException {
        try (CloseableHttpResponse response = client.execute(request)) {
            if (response.getStatusLine().getStatusCode() != 200) {
                return null;
            }
            return EntityUtils.toString(response.getEntity(), "utf8");
        }
    }

    /**
     * Resolves the download URL of a single list entry and inserts the record.
     * Entries without a recognisable id or without a download link are skipped.
     */
    private static void crawlItem(CloseableHttpClient client, Element item, ppt p) throws IOException {
        Elements titleLink = item.select(".p-title");
        String aid = extractAid(titleLink.attr("href"));
        if (aid == null) {
            return;
        }
        p.setLink(aid);
        p.setTitle(titleLink.text());

        String detailHtml = fetchHtml(client, new HttpGet("http://www.ypppt.com/p/d.php?aid=" + aid));
        if (detailHtml == null) {
            return;
        }
        // The download link is taken from the last <li> of the download page.
        Element lastItem = Jsoup.parse(detailHtml).select("li").last();
        if (lastItem == null) {
            return;
        }
        String downloadUrl = lastItem.select("a").attr("href");
        if (downloadUrl.isEmpty()) {
            return;
        }
        // Links containing "uploads" are rewritten onto the youpinppt.com host with the
        // "uploads/" segment removed.
        if (downloadUrl.contains("uploads")) {
            downloadUrl = "http://www.youpinppt.com" + downloadUrl.replace("uploads/", "");
        }
        p.setDownloadUrl(downloadUrl);
        insert(p);
    }

    /**
     * Extracts the numeric id that directly precedes the file extension of {@code href}.
     *
     * <p>Works for ids of any length, unlike a fixed-offset substring.
     * NOTE(review): assumes hrefs shaped like {@code /article/2018/5231.html}; confirm against
     * the live markup.
     *
     * @return the digits before the last {@code '.'}, or {@code null} if there are none
     */
    private static String extractAid(String href) {
        int end = href.lastIndexOf('.');
        int start = end;
        while (start > 0 && href.charAt(start - 1) >= '0' && href.charAt(start - 1) <= '9') {
            start--;
        }
        if (end <= 0 || start == end) {
            return null;
        }
        return href.substring(start, end);
    }

    /**
     * Inserts {@code p} into the {@code ppt} table.
     *
     * @return the number of rows inserted; 0 when no connection is available or the
     *         statement fails (the cause is printed to stderr)
     */
    private static int insert(ppt p) {
        String sql = "insert into ppt (title,link,downloadUrl) values(?,?,?)";
        // try-with-resources closes statement and connection even when the insert fails.
        try (Connection conn = getConn()) {
            if (conn == null) {
                return 0;
            }
            try (PreparedStatement pstmt = conn.prepareStatement(sql)) {
                pstmt.setString(1, p.getTitle());
                pstmt.setString(2, p.getLink());
                pstmt.setString(3, p.getDownloadUrl());
                return pstmt.executeUpdate();
            }
        } catch (SQLException e) {
            e.printStackTrace();
            return 0;
        }
    }
}

这里爬取的都是下载链接 然后我单独开了个类去下载 暂时下了500套

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
package com.sammie.top.test;

import org.apache.commons.io.FileUtils;

import java.io.File;
import java.net.URL;
import java.sql.*;

/**
 * Downloads the archives whose URLs were stored in the {@code ppt} table into
 * {@code E:/files/}, naming each file after the stored title.
 */
public class download {
    /** Maximum time to wait for the TCP connection to the download host. */
    private static final int CONNECT_TIMEOUT_MS = 10_000;
    /** Maximum time to wait for data on an open connection before giving up. */
    private static final int READ_TIMEOUT_MS = 60_000;

    public static void main(String[] args) {
        getAll();
    }

    /**
     * Reads up to 500 rows from the {@code ppt} table and downloads each one.
     *
     * @return the number of downloads attempted; 0 when no connection is available
     */
    private static Integer getAll() {
        String sql = "select * from ppt limit 500 ";
        int attempted = 0;
        // try-with-resources releases connection, statement and result set on every path.
        try (Connection conn = getConn()) {
            if (conn == null) {
                return attempted;
            }
            try (PreparedStatement pstmt = conn.prepareStatement(sql);
                 ResultSet rs = pstmt.executeQuery()) {
                while (rs.next()) {
                    String url = rs.getString("downloadUrl");
                    if (url == null || url.trim().isEmpty()) {
                        continue; // nothing to download for this row
                    }
                    String fileName = safeFileName(rs.getString("title")) + ".rar";
                    downloadHttpUrl(url, "E:/files/", fileName);
                    attempted++;
                }
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return attempted;
    }

    /**
     * Turns a scraped title into a name that is safe to use as a single file name: path
     * separators and characters that Windows forbids in file names are replaced by
     * {@code '_'}, so a title can neither escape the target directory nor make the write fail.
     */
    private static String safeFileName(String title) {
        if (title == null || title.trim().isEmpty()) {
            return "untitled";
        }
        return title.trim().replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_");
    }

    /**
     * Downloads {@code url} into the file {@code fileName} inside {@code dir}, creating the
     * directory if necessary. Never throws: any failure is printed to stderr.
     *
     * @param url      absolute URL of the resource
     * @param dir      target directory, with or without a trailing separator
     * @param fileName name of the file to create inside {@code dir}
     */
    public static void downloadHttpUrl(String url, String dir, String fileName) {
        try {
            URL httpurl = new URL(url);
            File dirfile = new File(dir);
            if (!dirfile.exists()) {
                dirfile.mkdirs();
            }
            // Resolving against the directory (instead of concatenating strings) also works
            // when dir has no trailing separator; an empty dir keeps meaning "relative path".
            File target = dir.isEmpty() ? new File(fileName) : new File(dirfile, fileName);
            // Explicit timeouts keep one dead server from blocking the whole run.
            FileUtils.copyURLToFile(httpurl, target, CONNECT_TIMEOUT_MS, READ_TIMEOUT_MS);
            System.out.println(fileName+"下载完成!");
        } catch (Exception e) {
            // Deliberately broad: callers rely on this method never throwing.
            e.printStackTrace();
        }
    }

    /**
     * Opens a new JDBC connection to the local {@code test} database.
     *
     * @return the connection, or {@code null} when the driver class is missing or the
     *         connection attempt fails (the cause is printed to stderr)
     */
    private static Connection getConn() {
        String driver = "com.mysql.cj.jdbc.Driver";
        String url = "jdbc:mysql://localhost:3306/test?characterEncoding=utf8&useSSL=false&serverTimezone=UTC";
        String username = "root";
        // NOTE(review): credentials are hard-coded; consider moving them to configuration.
        String password = "123456";
        try {
            Class.forName(driver); // load the JDBC driver class
            return DriverManager.getConnection(url, username, password);
        } catch (ClassNotFoundException | SQLException e) {
            e.printStackTrace();
            return null;
        }
    }
}

结语

偶偶偶偶凯~