`

网页爬虫程序pageSpider

阅读更多
2009-05-05 19:44

该程序仅抓取单个URL所对应的网页内容(源文件为 pageSpider.java)。程序流程图如下:

 

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.ProtocolException;
import java.net.URL;


/**
 * Minimal single-page crawler: opens an HTTP connection to one URL, downloads the
 * response body on a worker thread and prints it to standard output line by line.
 *
 * <p>Constructing an instance performs the whole fetch: the constructor creates the
 * connection object, starts a worker thread running {@link #run()}, and blocks until
 * that thread has finished. All failures are reported with {@code printStackTrace()}
 * (or a message on standard error) and never propagate to the caller.
 *
 * <p>The response body is decoded as {@code gb2312}.
 */
public class pageSpider implements Runnable {

    /** URL fetched by the no-argument constructor. */
    private static final String DEFAULT_URL = "http://www.baidu.com";

    /** Charset name used to decode the response body. */
    private static final String PAGE_CHARSET = "gb2312";

    /** Connection to {@link #url}; stays {@code null} when it could not be created. */
    HttpURLConnection httpUrlConnection;

    /** Raw response stream; {@code null} until {@link #run()} has opened it. */
    InputStream inputStream;

    /** Line reader over {@link #inputStream}; {@code null} until {@link #run()} has opened it. */
    BufferedReader bufferedReader;

    /** Address of the page to fetch. */
    String url;

    /**
     * Fetches and prints the default page ({@code http://www.baidu.com}).
     */
    public pageSpider() {
        this(DEFAULT_URL);
    }

    /**
     * Fetches and prints the page behind the given URL.
     *
     * <p>Blocks until the worker thread has finished. If the URL is malformed, is not an
     * HTTP(S) URL, or the connection object cannot be created, the problem is reported and
     * the worker thread does nothing.
     *
     * @param url address of the page to fetch
     */
    public pageSpider(String url) {
        this.url = url;

        try {
            // Create the connection object (no network traffic happens yet).
            java.net.URLConnection connection = new URL(url).openConnection();
            if (connection instanceof HttpURLConnection) {
                httpUrlConnection = (HttpURLConnection) connection;
            } else {
                // Guard the cast: file:, ftp:, jar: ... URLs yield other connection types.
                System.err.println("Not an HTTP(S) URL: " + url);
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }

        System.out.println("---------start-----------");

        // The worker is started with "this" before the constructor returns; that is safe
        // here only because every field it reads is assigned above and we join right away.
        Thread thread = new Thread(this);
        thread.start();
        try {
            thread.join();
        } catch (InterruptedException e) {
            e.printStackTrace();
            // Restore the interrupt flag so code further up the stack can observe it.
            Thread.currentThread().interrupt();
        }

        System.out.println("----------end------------");
    }

    /**
     * Performs the GET request and prints every line of the response body.
     *
     * <p>Returns immediately when no connection object exists. The first I/O failure stops
     * the fetch; whatever was opened up to that point is closed and the connection is
     * disconnected.
     */
    @Override
    public void run() {
        if (httpUrlConnection == null) {
            // The constructor already reported why the connection could not be created.
            return;
        }

        try {
            httpUrlConnection.setRequestMethod("GET");
            httpUrlConnection.setUseCaches(true); // allow cached responses
            httpUrlConnection.connect();          // establish the connection

            inputStream = httpUrlConnection.getInputStream(); // open the response body
            bufferedReader = new BufferedReader(new InputStreamReader(inputStream, PAGE_CHARSET));

            String line;
            while ((line = bufferedReader.readLine()) != null) {
                System.out.println(line); // print the page content
            }
        } catch (IOException e) {
            // Also covers ProtocolException, which is an IOException.
            e.printStackTrace();
        } finally {
            // Either resource may still be null if the failure happened before it was
            // opened, and one failing close must not prevent the remaining cleanup.
            closeQuietly(bufferedReader);
            closeQuietly(inputStream);
            httpUrlConnection.disconnect();
        }
    }

    /**
     * Closes the resource if it is non-null, reporting (but not propagating) any failure.
     */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Program entry point: fetches and prints the default page.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        new pageSpider();
    }

}

 

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics