

Sharing a Simple Java Crawler Framework

2021-02-03 11:17 qq_35488769 Java Tutorials

This article presents a simple Java crawler framework. It should be a useful reference for anyone who needs one.

Writing a separate crawler for every site gets tedious, so I built a small framework of my own.

You can customize:

The request (default: a GET request with a Chrome user-agent); implement the RequestSet interface to supply your own.

Storage (default: pages are saved under the html folder on the F: drive); implement the SaveUtil interface to supply your own.

Which resources to save (default: the entire HTML page).

URL filtering (default: every URL qualifies); implement the ResourseChooser interface to decide which URLs to follow and which pages to save as resources.

實(shí)現(xiàn)的部分有:

html頁面的下載方式,通過HttpClient實(shí)現(xiàn)html頁面的下載

html頁面的解析部分,通過jsoup實(shí)現(xiàn)html頁面的解析

The HtmlDownloader class downloads the HTML page at a given URL.

package DownloadPackage;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
/*
 * Downloads the HTML page at a given url.
 */
public class HtmlDownloader {
    RequestSet requestset = null;
    public HtmlDownloader(RequestSet requestset){
        this.requestset = requestset;
    }
    public String downloadhtml(String url){
        String html = null;
        //create a client, then a reader that pulls the html out of the response entity
        BufferedReader reader = null;
        CloseableHttpClient httpclient = HttpClients.createDefault();
        HttpResponse response = null;
        try {
            response = httpclient.execute(requestset.getMethod(url));
            HttpEntity entity = response.getEntity();
            reader = new BufferedReader(new InputStreamReader(entity.getContent()));
            StringBuilder sb = new StringBuilder();
            String line = null;
            while((line = reader.readLine()) != null){
                sb.append(line).append('\n');
            }
            html = sb.toString();
            System.out.println("fetched one html page");
        }
        catch (IOException e) {
            System.out.println("failed to connect to "+url);
        }
        finally{
            //close the client even if the reader was never opened
            try {
                if(reader != null){
                    reader.close();
                }
                httpclient.close();
            }
            catch (IOException e) {
                e.printStackTrace();
            }
        }
        return html;
    }
}
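The downloader reads the response line by line. An equivalent stdlib-only approach reads the raw bytes in one pass and decodes them at the end, which sidesteps per-line handling and preserves the page's original line endings (the helper class name here is made up for illustration):

```java
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

public class StreamToString {
    // Copy an entire InputStream into a byte buffer, then decode it as UTF-8.
    // Unlike readLine()-based loops, this keeps the page's line terminators.
    public static String read(InputStream in) throws IOException {
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        byte[] chunk = new byte[8192];
        int n;
        while ((n = in.read(chunk)) != -1) {
            buf.write(chunk, 0, n);
        }
        return new String(buf.toByteArray(), StandardCharsets.UTF_8);
    }
}
```

Real pages may declare a charset other than UTF-8 in their Content-Type header; a production version would take the charset from the HttpEntity instead of assuming UTF-8.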

The UrlGet class extracts all URL links from an HTML page.

package DownloadPackage;
import java.util.LinkedList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class UrlGet {
    public LinkedList<String> geturls(String html){
        LinkedList<String> urls = new LinkedList<String>();
        Document doc = Jsoup.parse(html);
        Elements links = doc.getElementsByTag("a");
        for (Element link:links){
            String url = link.attr("href");
            urls.add(url);
        }
        return urls;
    }
}
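For comparison, here is a rough jsoup-free sketch of the same extraction using only the JDK's regex classes. jsoup is far more robust against messy real-world HTML, so this is only an illustration of what geturls pulls out of a page (the class name is invented):

```java
import java.util.LinkedList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RegexUrlGet {
    // Match <a ... href="..."> (or single quotes) and capture the href value.
    private static final Pattern HREF =
        Pattern.compile("<a\\s[^>]*href=[\"']([^\"']*)[\"']", Pattern.CASE_INSENSITIVE);

    // Collect every href value found in the html, in document order.
    public static LinkedList<String> geturls(String html) {
        LinkedList<String> urls = new LinkedList<String>();
        Matcher m = HREF.matcher(html);
        while (m.find()) {
            urls.add(m.group(1));
        }
        return urls;
    }
}
```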

The resource-chooser interface requires three methods: isNeed decides whether a URL should be crawled at all; isResourse decides whether the page behind a URL is one of the resources we want to save; and process rewrites a URL, because pages sometimes contain URLs we want but in the wrong format (for example relative links).

package ChoosePackage;
public interface ResourseChooser {
    public Boolean isNeed(String url);
    public Boolean isResourse(String url);
    public String process(String url);
}
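As a hypothetical example, a chooser that keeps the crawl on a single site and only saves .html pages might look like this (the interface is repeated so the sketch is self-contained; in the project it lives in ChoosePackage, and SameSiteChooser is an invented name):

```java
// Repeated here so the sketch compiles on its own.
interface ResourseChooser {
    Boolean isNeed(String url);
    Boolean isResourse(String url);
    String process(String url);
}

public class SameSiteChooser implements ResourseChooser {
    private final String root;   // e.g. "http://www.bilibili.net"

    public SameSiteChooser(String root) {
        this.root = root;
    }

    // Follow relative links and absolute links that stay on the same site.
    public Boolean isNeed(String url) {
        return url.startsWith("/") || url.startsWith(root);
    }

    // Only pages ending in ".html" count as resources worth saving.
    public Boolean isResourse(String url) {
        return url.endsWith(".html");
    }

    // Turn relative links into absolute ones.
    public String process(String url) {
        return url.startsWith("http") ? url : root + url;
    }
}
```

An instance would be passed to one of the Spider constructors that accepts a ResourseChooser.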

The RequestSet interface is for customizing the request; implement getMethod to return the request to execute.

package DownloadPackage;
import org.apache.http.client.methods.HttpGet;
/*
 * An interface for supplying the request to execute.
 * Implement getMethod to return the Get request for a url.
 */
public interface RequestSet {
    public HttpGet getMethod(String url);
}
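The framework builds on Apache HttpClient, but the same "configurable request" idea can be sketched with the HTTP client built into the JDK since Java 11. Assuming Java 11+, a GET with a browser user-agent and a timeout looks like this (the class name is invented):

```java
import java.net.URI;
import java.net.http.HttpRequest;
import java.time.Duration;

public class JdkRequestSet {
    // Build a GET request for the url with a browser-like user-agent header
    // and a 10-second timeout, mirroring what getRequest below does with
    // Apache HttpClient's HttpGet and RequestConfig.
    public static HttpRequest getMethod(String url) {
        return HttpRequest.newBuilder(URI.create(url))
                .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36")
                .timeout(Duration.ofSeconds(10))
                .GET()
                .build();
    }
}
```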
The SaveUtil interface is for customizing storage; implement the save method.

package SaveUtil;
/*
 * Storage interface: implementations must provide the save method.
 */
public interface SaveUtil {
    public void save(String url,String html);
}
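A hypothetical custom implementation that writes each page into a directory of your choice with java.nio (the interface is repeated for self-containment; DirSaveUtil is an invented name):

```java
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;

// Repeated here so the sketch compiles on its own.
interface SaveUtil {
    void save(String url, String html);
}

public class DirSaveUtil implements SaveUtil {
    private final Path dir;

    public DirSaveUtil(Path dir) {
        this.dir = dir;
    }

    public void save(String url, String html) {
        // Derive a flat, filesystem-safe filename from the url.
        String name = url.replaceAll("[^A-Za-z0-9]", "_") + ".html";
        try {
            Files.createDirectories(dir);
            Files.write(dir.resolve(name), html.getBytes(StandardCharsets.UTF_8));
        } catch (IOException e) {
            System.out.println("failed to save " + url);
        }
    }
}
```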

The Spider class has five constructors supporting various combinations of custom components, and contains default implementations of the interfaces above.

package Spider;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import ChoosePackage.MyResourseChooser;
import ChoosePackage.ResourseChooser;
import DownloadPackage.HtmlDownloader;
import DownloadPackage.RequestSet;
import DownloadPackage.UrlGet;
import SaveUtil.MySaveUtil;
import SaveUtil.SaveUtil;
/*
 * The class that actually crawls resources.
 */
public class Spider{
    public static void main(String[] args) {
        new Spider("http://www.bilibili.net").spiderstart();
    }
    //seed url
    String seed = null;
    //storage strategy; supply your own implementation
    private SaveUtil saveutil = null;
    //html downloader
    private HtmlDownloader downloader = null;
    //url extractor
    private UrlGet urldownloader = null;
    //resource chooser
    private ResourseChooser resoursechooser = null;
    //pages not yet downloaded
    LinkedList<String> unvisited = new LinkedList<String>();
    //pages already downloaded
    HashSet<String> visited = new HashSet<String>();
    //constructor with custom storage, request and resource-choosing strategies
    public Spider(SaveUtil saveutil,RequestSet request,ResourseChooser resoursechooser,String seed){
        this.saveutil = saveutil;
        this.downloader = new HtmlDownloader(request);
        this.urldownloader = new UrlGet();
        this.resoursechooser = resoursechooser;
        this.seed = seed;
        unvisited.add(seed);
    }
    //自定義儲(chǔ)存方式,資源篩選方式的構(gòu)造方法
    public Spider(SaveUtil saveutil,ResourseChooser resoursechooser,String seed){
        this.resoursechooser = resoursechooser;
        this.downloader = new HtmlDownloader(new getRequest());
        this.saveutil = saveutil;
        this.urldownloader = new UrlGet();
        this.seed = seed;
        unvisited.add(seed);
    }
    //自定義儲(chǔ)存方式,請求的構(gòu)造方法
    public Spider(SaveUtil saveutil,RequestSet requestset,String seed){
        this.saveutil = saveutil;
        this.downloader = new HtmlDownloader(requestset);
        this.resoursechooser = new MyResourseChooser();
        this.urldownloader = new UrlGet();
        this.seed = seed;
        unvisited.add(seed);
    }
    //自定義儲(chǔ)存方式的構(gòu)造方法
    public Spider(SaveUtil saveutil,String seed){
        this.saveutil = saveutil;
        this.downloader = new HtmlDownloader(new getRequest());
        this.resoursechooser = (new MyResourseChooser());
        this.urldownloader = new UrlGet();
        this.seed = seed;
        unvisited.add(seed);
    }
    //默認(rèn)的爬蟲構(gòu)造方法
    public Spider(String seed){
        this.saveutil = new MySaveUtil();
        this.downloader = new HtmlDownloader(new getRequest());
        this.resoursechooser = (new MyResourseChooser());
        this.urldownloader = new UrlGet();
        this.seed = seed;
        unvisited.add(seed);
    }
    //start crawling
    private void spiderstart(){
        String html = null;
        while(!unvisited.isEmpty()){
            String url = unvisited.poll();
            System.out.println("fetching "+url);
            if(resoursechooser.isNeed(url)){
                try{
                    html = downloader.downloadhtml(url);
                }
                catch(RuntimeException e){
                    System.out.println("failed to fetch "+url);
                    continue;
                }
                visited.add(url);
                LinkedList<String> urls = new LinkedList<String>();
                try{
                    urls = urldownloader.geturls(html);
                }
                catch(RuntimeException e){
                    System.out.println("empty html page for "+url);
                    continue;
                }
                Iterator<String> it = urls.iterator();
                while(it.hasNext()){
                    String newurl = it.next();
                    if(resoursechooser.isNeed(newurl)&&!visited.contains(newurl)&&!unvisited.contains(newurl)){
                        newurl = resoursechooser.process(newurl);
                        unvisited.add(newurl);
                        System.out.println(newurl+" added to the queue");
                    }
                }
                System.out.println("collected all urls on "+url);
                if(resoursechooser.isResourse(url)){
                    saveutil.save(url,html);
                }
            }
        }
    }
    //默認(rèn)資源篩選類
    private class MyResourseChooser implements ResourseChooser{
        @Override
        public Boolean isNeed(String url) {
            if(!url.startsWith("/")&&!url.startsWith("http")){
                return false;
            }
            return true;
        }
        @Override
        public Boolean isResourse(String url) {
            return true;
        }
        @Override
        public String process(String url) {
            if(!url.startsWith("http")){
                url = seed+url;
            }
            return url;
        }
    }
    public class getRequest implements RequestSet{
        public HttpGet getMethod(String url) {
            //create a get request
            HttpGet getmethod = new HttpGet(url);
            //no proxy is set here; e.g. HttpHost proxy = new HttpHost("124.88.67.81",80);
            //set the request timeouts
            RequestConfig responseconfig = RequestConfig.custom().setConnectionRequestTimeout(10000).setConnectTimeout(10000).setSocketTimeout(10000).build();
            //set the request headers, mainly the user-agent
            getmethod.addHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36");
            //apply the config to the request
            getmethod.setConfig(responseconfig);
            return getmethod;
        }
    }
    //默認(rèn)的存儲(chǔ)類
    public class MySaveUtil implements SaveUtil{
        @Override
        public void save(String url, String html) {
            String filename = getfilename(url);
            BufferedWriter writer = null;
            try{
                writer = new BufferedWriter(new FileWriter(filename));
                writer.write(html);
                writer.flush();
                System.out.println("file written successfully");
            }
            catch(IOException e){
                System.out.println("failed to write the file");
            }
            finally{
                try {
                    if(writer != null)
                    writer.close();
                }
                catch (IOException e) {
                    System.out.println("failed to close the stream");
                }
            }
        }
        private String getfilename(String url){
            String fileparentpath = "f://html";
            File file = new File(fileparentpath);
            if(!file.exists()){
                file.mkdir();
            }
            int last = url.lastIndexOf(".");
            int first = url.indexOf(".");
            url = url.substring(first,last);
            url = url.replaceAll("\\.", "");
            url = url.replaceAll("/", "");
            return fileparentpath+"/"+url+".txt";
        }
    }
}
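Stripped of the HTTP and parsing machinery, spiderstart is a plain breadth-first traversal: poll a url from unvisited, mark it visited, enqueue its unseen links, and save the page if it is a resource. That loop can be sketched self-contained over an in-memory link graph (the graph and class name are made up for illustration):

```java
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

public class CrawlLoop {
    // Breadth-first traversal over an in-memory link graph: each key is a
    // "url", each value the list of urls that page links to. Returns the
    // pages in the order they were visited.
    public static List<String> crawl(String seed, Map<String, List<String>> links) {
        LinkedList<String> unvisited = new LinkedList<String>();
        HashSet<String> visited = new HashSet<String>();
        List<String> order = new LinkedList<String>();
        unvisited.add(seed);
        while (!unvisited.isEmpty()) {
            String url = unvisited.poll();
            if (!visited.add(url)) {
                continue;          // already downloaded
            }
            order.add(url);        // stand-in for "download and save"
            for (String next : links.getOrDefault(url, List.of())) {
                if (!visited.contains(next) && !unvisited.contains(next)) {
                    unvisited.add(next);
                }
            }
        }
        return order;
    }
}
```

Note that unvisited.contains(newurl) is an O(n) scan on a LinkedList; for large crawls, keeping a companion HashSet of already-enqueued urls avoids the linear check.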

Summary

That is everything this article covers about the simple Java crawler framework; I hope it helps. If you have questions, leave a comment and I will reply as soon as I can. Thanks for supporting the site!

Original link: http://blog.csdn.net/qq_35488769/article/details/70591405
