Web Crawler

By zhan-bin, 2018-06-22
Capturing Page Source

Fetch the HTML source of a web page, then run a regular expression over it to extract the links it contains.

Code:


package GetData;

import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class GetWebsiteData {

    public static void main(String[] args) {
        URL url = null;
        URLConnection urlconn = null;
        BufferedReader br = null;
        PrintWriter pw = null;
        // Loose pattern for http:// links: word characters plus '+', '.', '?', '/'
        // inside the character class, ending in a dot followed by letters (e.g. ".com").
        String regex = "http://[\\w+\\.?/?]+\\.[A-Za-z]+";
        Pattern p = Pattern.compile(regex);
        try {
            url = new URL("http://www.baidu.com/");
            urlconn = url.openConnection();
            // Here I store the crawled links in a text file named url(baidu).txt on the F: drive.
            pw = new PrintWriter(new FileWriter("f:/url(baidu).txt"), true);
            br = new BufferedReader(new InputStreamReader(
                    urlconn.getInputStream(), "UTF-8")); // assumes the page is UTF-8-encoded
            String buf = null;
            while ((buf = br.readLine()) != null) {
                // Write every substring of the current line that matches the link pattern.
                Matcher buf_m = p.matcher(buf);
                while (buf_m.find()) {
                    pw.println(buf_m.group());
                }
            }
            System.out.println("Fetch succeeded!");
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Null checks guard against a NullPointerException: either stream may
            // still be null if the connection failed before it was opened.
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (pw != null) {
                pw.close();
            }
        }
    }
}
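
On Java 7 and later, the null checks in the finally block can be dropped entirely by using try-with-resources, which closes the streams automatically in reverse declaration order even when an exception is thrown. Below is a minimal sketch of the same crawler in that style; the URL, output path, and regex are carried over from the example above, while the class name GetWebsiteDataTryWithResources is my own and the UTF-8 charset is an assumption about the page encoding.

package GetData;

import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class GetWebsiteDataTryWithResources {

    public static void main(String[] args) {
        Pattern p = Pattern.compile("http://[\\w+\\.?/?]+\\.[A-Za-z]+");
        // Both resources are declared in the try header, so they are closed
        // automatically even if readLine() throws mid-loop.
        try (PrintWriter pw = new PrintWriter(new FileWriter("f:/url(baidu).txt"), true);
             BufferedReader br = new BufferedReader(new InputStreamReader(
                     new URL("http://www.baidu.com/").openConnection().getInputStream(),
                     "UTF-8"))) { // assumes the page is UTF-8-encoded
            String buf;
            while ((buf = br.readLine()) != null) {
                Matcher m = p.matcher(buf);
                while (m.find()) {
                    pw.println(m.group());
                }
            }
            System.out.println("Fetch succeeded!");
        } catch (IOException e) {
            // MalformedURLException is a subclass of IOException, so one catch covers both.
            e.printStackTrace();
        }
    }
}

Note that the pattern is deliberately loose: it will also pick up addresses embedded in attribute values or script text, which is usually what you want when harvesting links, but it does not match https:// URLs as written.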