抓取网页链接

时间:2014-05-04 17:35:41   收藏:0   阅读:341

package com.smilezl.scrapy;


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Small link scraper: downloads a page, extracts at most one link per line of
 * HTML, and stores the distinct links in the {@code scrapyurl} table.
 */
public class ScrapyUrl {

    /** Charset used to decode the page when the response does not declare one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /** Captures the value of a {@code charset=} parameter, with or without quotes. */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    /** Matches an absolute http/https URL made of word characters and {@code . - / :}. */
    private static final Pattern ABSOLUTE_URL_PATTERN =
            Pattern.compile("https?://[\\w.\\-/:]+");

    /**
     * Captures a root-relative link ending in "html" inside a double-quoted href.
     * The capture is confined to the attribute value so it cannot run on into
     * later attributes on the same line.
     */
    private static final Pattern RELATIVE_HREF_PATTERN =
            Pattern.compile("href=\"(/[^\"]*html)");

    /** JDBC settings. NOTE(review): credentials are hard-coded; consider externalizing. */
    private static final String JDBC_URL =
            "jdbc:postgresql://localhost:5432/mydb?useUnicode=true&characterEncoding=gbk";
    private static final String JDBC_USER = "postgres";
    private static final String JDBC_PASSWORD = "password";

    private static final String INSERT_SQL = "insert into scrapyurl(url,type) values(?,0)";

    /**
     * Downloads a page and collects the links found in it.
     *
     * <p>The page is read line by line and {@link #getHref(String, String)} is applied
     * to each line, so at most one link per line is collected. I/O failures are
     * printed and whatever was collected so far is returned.
     *
     * @param htmlUrl address of the page to scan; the connection is cast to
     *                {@link HttpURLConnection}, so it must be an http/https URL
     * @return the distinct links in the order they were first seen; never {@code null}
     */
    public static List<String> parserHtml(String htmlUrl) {
        // LinkedHashSet keeps first-seen order and makes the duplicate check O(1).
        Set<String> links = new LinkedHashSet<String>();
        HttpURLConnection connection = null;
        try {
            connection = (HttpURLConnection) new URL(htmlUrl).openConnection();

            // getContentType() may return null; getCharset() tolerates that.
            String charSet = getCharset(connection.getContentType());
            if (charSet == null) {
                charSet = DEFAULT_CHARSET;
            }

            // try-with-resources closes the reader (and the underlying stream)
            // even when reading fails part-way through.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), charSet))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    String href = getHref(line, htmlUrl);
                    if (href != null) {
                        links.add(href);
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
        return new ArrayList<String>(links);
    }

    /**
     * Extracts the charset name from a Content-Type style string.
     *
     * @param str text such as {@code "text/html; charset=UTF-8"}; may be {@code null}
     * @return the charset value without surrounding quotes or trailing parameters,
     *         or {@code null} when {@code str} is {@code null} or declares no charset
     */
    public static String getCharset(String str) {
        if (str == null) {
            return null;
        }
        Matcher matcher = CHARSET_PATTERN.matcher(str);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Reads a link from a single line of HTML.
     *
     * <p>An absolute http/https URL anywhere in the line wins. Otherwise a
     * root-relative {@code href="/....html"} is resolved against {@code htmlUrl}.
     *
     * @param str     one line of HTML; {@code null} yields {@code null}
     * @param htmlUrl address of the page the line came from, used as the base for
     *                resolving relative links
     * @return the first link found, or {@code null} when the line has no link or a
     *         relative link cannot be resolved because {@code htmlUrl} is malformed
     */
    public static String getHref(String str, String htmlUrl) {
        if (str == null) {
            return null;
        }

        Matcher absolute = ABSOLUTE_URL_PATTERN.matcher(str);
        if (absolute.find()) {
            return absolute.group(0);
        }

        // Relative link: resolve the path against the page URL rather than
        // appending it to the page URL text, which only worked for a site root.
        Matcher relative = RELATIVE_HREF_PATTERN.matcher(str);
        if (relative.find()) {
            try {
                return new URL(new URL(htmlUrl), relative.group(1)).toString();
            } catch (IOException e) {
                // MalformedURLException: no usable base URL, so no usable link.
                return null;
            }
        }
        return null;
    }

    /**
     * Scans a page and saves every link found in it to the {@code scrapyurl} table
     * with {@code type} 0. Each link is also printed to standard output.
     *
     * <p>Failures are printed and not propagated, as before.
     *
     * @param hrefurl address of the page whose links should be saved
     */
    public static void saveUrlList(String hrefurl) {
        try {
            // Fetch first so no database connection is held open during the download.
            List<String> links = parserHtml(hrefurl);
            if (links.isEmpty()) {
                return;
            }

            // Explicit load keeps drivers that predate JDBC 4 auto-registration working.
            Class.forName("org.postgresql.Driver");

            try (Connection con = DriverManager.getConnection(JDBC_URL, JDBC_USER, JDBC_PASSWORD);
                 PreparedStatement insert = con.prepareStatement(INSERT_SQL)) {
                for (String link : links) {
                    System.out.println(link);
                    // Bound parameter: scraped text never becomes part of the SQL itself.
                    insert.setString(1, link);
                    insert.executeUpdate();
                }
            }
        } catch (Exception e) {
            // Top-level boundary: preserve the method's report-and-continue contract.
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        saveUrlList("http://fo.ifeng.com/fojiaomeiwen/list_0/0.shtml");
    }
}


抓取网页链接,布布扣,bubuko.com

评论(0)
© 2014 mamicode.com 版权所有 京ICP备13008772号-2  联系我们:gaon5@hotmail.com
迷上了代码!