A Basic Web Crawler Written in Java
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class SearchCrawler implements Runnable {

    /* disallowListCache caches the URLs that robots are not allowed to visit.
     * The robots exclusion protocol places a robots.txt file in a site's root
     * directory, declaring which pages on the site must not be crawled.
     * A crawler should skip those areas. An example robots.txt:
     *
     *   # robots.txt for
     *   User-agent: *
     *   Disallow: /cgi-bin/
     *   Disallow: /registration   # Disallow robots on registration page
     *   Disallow: /login
     */
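
    /* For example, with the robots.txt shown above, a crawler should skip any
     * URL whose path starts with one of the Disallow prefixes, e.g.
     * http://<host>/cgi-bin/search or http://<host>/login (hypothetical URLs,
     * shown only to illustrate the prefix match used in isRobotAllowed below). */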

    // Cache of per-host disallow lists built from robots.txt files.
    private HashMap<String, ArrayList<String>> disallowListCache =
            new HashMap<String, ArrayList<String>>();
    ArrayList<String> errorList = new ArrayList<String>();  // error messages
    ArrayList<String> result = new ArrayList<String>();     // search results
    String startUrl;       // URL the crawl starts from
    int maxUrl;            // maximum number of URLs to process
    String searchString;   // string to search for (English)
    boolean caseSensitive = false;  // whether matching is case-sensitive
    boolean limitHost = false;      // whether to stay within the starting host

    public SearchCrawler(String startUrl, int maxUrl, String searchString) {
        this.startUrl = startUrl;
        this.maxUrl = maxUrl;
        this.searchString = searchString;
    }

    public ArrayList<String> getResult() {
        return result;
    }

    public void run() {  // start the search thread
        crawl(startUrl, maxUrl, searchString, limitHost, caseSensitive);
    }
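
    // A minimal usage sketch, not part of the original listing: the start
    // URL, page limit, and search term below are made-up illustration values.
    public static void main(String[] args) {
        SearchCrawler crawler =
                new SearchCrawler("http://www.example.com", 50, "java");
        new Thread(crawler).start();  // run() hands off to crawl()
        // Matching page URLs accumulate in crawler.getResult() as pages are visited.
    }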

    // Verify the URL format.
    private URL verifyUrl(String url) {
        // Only process HTTP URLs.
        if (!url.toLowerCase().startsWith("http://"))
            return null;

        URL verifiedUrl = null;
        try {
            verifiedUrl = new URL(url);
        } catch (Exception e) {
            return null;
        }

        return verifiedUrl;
    }
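
    /* Note: verifyUrl accepts only "http://" URLs, so e.g. an "https://" or
     * "ftp://" link (hypothetical examples) is dropped, and any string the
     * URL constructor cannot parse is rejected via the caught exception. */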

    // Check whether robots are allowed to access the given URL.
    private boolean isRobotAllowed(URL urlToCheck) {
        String host = urlToCheck.getHost().toLowerCase();  // host of the given URL
        // System.out.println("host=" + host);

        // Fetch the host's cached list of disallowed URLs.
        ArrayList<String> disallowList = disallowListCache.get(host);

        // If it is not cached yet, download and cache it.
        if (disallowList == null) {
            disallowList = new ArrayList<String>();
            try {
                URL robotsFileUrl = new URL("http://" + host + "/robots.txt");
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(robotsFileUrl.openStream()));

                // Read the robots file and build the list of disallowed paths.
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.indexOf("Disallow:") == 0) {  // line starts with "Disallow:"
                        String disallowPath =
                                line.substring("Disallow:".length());  // the disallowed path

                        // Check for a trailing comment.
                        int commentIndex = disallowPath.indexOf("#");
                        if (commentIndex != -1) {
                            disallowPath =
                                    disallowPath.substring(0, commentIndex);  // strip the comment
                        }

                        disallowPath = disallowPath.trim();
                        disallowList.add(disallowPath);
                    }
                }

                // Cache this host's disallowed paths.
                disallowListCache.put(host, disallowList);
            } catch (Exception e) {
                return true;  // no robots.txt at the site root, so crawling is allowed
            }
        }

        String file = urlToCheck.getFile();
        // System.out.println("file getFile()=" + file);
        for (int i = 0; i < disallowList.size(); i++) {
            String disallow = disallowList.get(i);
            if (file.startsWith(disallow)) {
                return false;
            }
        }

        return true;
    }
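
    /* Note: this is a deliberately simple robots.txt reader. It applies every
     * Disallow line to every crawler (User-agent sections are ignored) and
     * matches by raw path prefix only. One consequence: an empty "Disallow:"
     * line, which the protocol defines as "allow everything", is trimmed to
     * "" here and would then block all paths, since every path starts with
     * the empty string. A production crawler needs a fuller parser. */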

    // Download the contents of the page at the given URL.
    private String downloadPage(URL pageUrl) {
        try {
            // Open connection to URL for reading.
            BufferedReader reader =
                    new BufferedReader(new InputStreamReader(pageUrl.openStream()));

            // Read page into buffer.
            String line;
            StringBuffer pageBuffer = new StringBuffer();
            while ((line = reader.readLine()) != null) {
                pageBuffer.append(line);
            }

            return pageBuffer.toString();
        } catch (Exception e) {
            // Ignore the error and fall through to return null.
        }

        return null;
    }
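
    /* Note: readLine() strips line terminators and the lines are appended
     * with no separator, so words at line boundaries in the HTML source run
     * together. A phrase that spans a newline in the page source may
     * therefore fail to match in searchStringMatches below. */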

    // Remove "www" from a URL.
    private String removeWwwFromUrl(String url) {
        int index = url.indexOf("://www.");
        if (index != -1) {
            return url.substring(0, index + 3) +
                    url.substring(index + 7);
        }

        return (url);
    }
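
    /* Example (hypothetical URL): "http://www.example.com/page" becomes
     * "http://example.com/page". Normalizing away the "www." prefix keeps
     * crawledList from treating the two spellings as different pages. */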

    // Parse the page and extract its links.
    private ArrayList<String> retrieveLinks(URL pageUrl, String pageContents,
            HashSet<String> crawledList, boolean limitHost) {
        // Compile the link-matching pattern as a regular expression.
        Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]",
                Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(pageContents);

        ArrayList<String> linkList = new ArrayList<String>();
        while (m.find()) {
            String link = m.group(1).trim();

            if (link.length() < 1) {
                continue;
            }

            // Skip links that point inside the current page.
            if (link.charAt(0) == '#') {
                continue;
            }

            if (link.indexOf("mailto:") != -1) {
                continue;
            }

            if (link.toLowerCase().indexOf("javascript") != -1) {
                continue;
            }

            if (link.indexOf("://") == -1) {
                if (link.charAt(0) == '/') {  // handle an absolute path
                    link = "http://" + pageUrl.getHost() + ":" + pageUrl.getPort() + link;
                } else {
                    String file = pageUrl.getFile();
                    if (file.indexOf('/') == -1) {  // handle a relative path
                        link = "http://" + pageUrl.getHost() + ":" + pageUrl.getPort() + "/" + link;
                    } else {
                        String path = file.substring(0, file.lastIndexOf('/') + 1);
                        link = "http://" + pageUrl.getHost() + ":" + pageUrl.getPort() + path + link;
                    }
                }
            }

            // Strip any in-page anchor from the link.
            int index = link.indexOf('#');
            if (index != -1) {
                link = link.substring(0, index);
            }

            link = removeWwwFromUrl(link);

            URL verifiedLink = verifyUrl(link);
            if (verifiedLink == null) {
                continue;
            }

            /* If crawling is limited to one host, exclude URLs on other hosts. */
            if (limitHost &&
                    !pageUrl.getHost().toLowerCase().equals(
                            verifiedLink.getHost().toLowerCase())) {
                continue;
            }

            // Skip links that have already been processed.
            if (crawledList.contains(link)) {
                continue;
            }

            linkList.add(link);
        }

        return (linkList);
    }
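
    /* Note: extracting links with a single regular expression is a rough
     * heuristic; it only sees href attributes of <a> tags. Also, the path
     * handling above produces URLs like "http://host:-1/path" when the page
     * URL has no explicit port, because URL.getPort() returns -1 in that
     * case. java.net.URL typically tolerates this (treating -1 as "no
     * port"), but a more careful version would omit the port component or
     * use getDefaultPort(). */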

    // Search the downloaded page content for the given search string.
    private boolean searchStringMatches(String pageContents, String searchString,
            boolean caseSensitive) {
        String searchContents = pageContents;
        if (!caseSensitive) {  // if matching is case-insensitive
            searchContents = pageContents.toLowerCase();
        }

        // Split the search string into terms on whitespace.
        Pattern p = Pattern.compile("[\\s]+");
        String[] terms = p.split(searchString);
        for (int i = 0; i < terms.length; i++) {
            if (caseSensitive) {
                if (searchContents.indexOf(terms[i]) == -1) {
                    return false;
                }
            } else {
                if (searchContents.indexOf(terms[i].toLowerCase()) == -1) {
                    return false;
                }
            }
        }

        return true;
    }
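
    /* Example: searching for "web crawler" splits into the terms "web" and
     * "crawler", and a page matches only if it contains both terms somewhere,
     * not necessarily adjacent or in order: an AND over terms, not a phrase
     * match. */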

    // Perform the actual crawl.
    public ArrayList<String> crawl(String startUrl, int maxUrls, String searchString,
            boolean limitHost, boolean caseSensitive) {
        System.out.println("searchString=" + searchString);
        HashSet<String> crawledList = new HashSet<String>();
        LinkedHashSet<String> toCrawlList = new LinkedHashSet<String>();

        if (maxUrls < 1) {
            errorList.add("Invalid Max URLs value.");
            System.out.println("Invalid Max URLs value.");
        }

        if (searchString.length() < 1) {
            errorList.add("Missing Search String.");
            System.out.println("Missing Search String.");
        }

        if (errorList.size() > 0) {
            System.out.println("err!!!");
            return errorList;
        }

        // Remove "www" from the start URL.
        startUrl = removeWwwFromUrl(startUrl);

        toCrawlList.add(startUrl);
        while (toCrawlList.size() > 0) {

            if (maxUrls != -1) {
                if (crawledList.size() == maxUrls) {
                    break;
                }
            }

            // Get the URL at the head of the list.
            String url = toCrawlList.iterator().next();

            // Remove the URL from the to-crawl list.
            toCrawlList.remove(url);

            // Convert the string url to a URL object.
            URL verifiedUrl = verifyUrl(url);

            // Skip the URL if it did not verify.
            if (verifiedUrl == null) {
                continue;
            }

            // Skip the URL if robots are not allowed to access it.
            if (!isRobotAllowed(verifiedUrl)) {
                continue;
            }

            // Add the processed URL to crawledList.
            crawledList.add(url);
            String pageContents = downloadPage(verifiedUrl);

            if (pageContents != null && pageContents.length() > 0) {
                // Retrieve the valid links from the page.
                ArrayList<String> links =
                        retrieveLinks(verifiedUrl, pageContents, crawledList, limitHost);

                toCrawlList.addAll(links);

                if (searchStringMatches(pageContents, searchString, caseSensitive)) {
                    result.add(url);
                    System.out.println(url);
                }
            }
        }

        return result;
    }
}