java使用正则表达式抓取网页内容存为txt

作者：the5fire | 标签： java 正则表达式 | 发布：2011-06-02 2:47 p.m. | 阅读量: 12541, 12190

前几天女友在网上看了一本电子书，想要下载下来，不过那个网站只能支持在线阅读，不提供下载，还好可以复制粘贴。
于是这个复制粘贴的任务便交给了我，看了一下网站url，单篇文章的html源码都很简单，作为一个程序员怎么可以重复地复制粘贴呢？
于是有了这个代码，比较简单：


package WEB;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Fetches a numbered series of web pages and collects the fragments that match
 * {@link #regex()}, so the result can be written to a plain-text file.
 *
 * <p>Each page address is built as {@code base URL + page number + ".html"}.
 *
 * @author Hu Yang
 * @blog http://www.the5fire.com
 *
 */
public class WebGet {
	/** Maximum time to wait for the TCP connection to be established. */
	private static final int CONNECT_TIMEOUT_MILLIS = 15000;

	/** Maximum time to wait for data once connected, so a stalled server cannot hang the crawl. */
	private static final int READ_TIMEOUT_MILLIS = 30000;

	/** Strips markup from a matched fragment; compiled once instead of once per match. */
	private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>");

	/** Base URL that the page number and ".html" are appended to. */
	private String myUrl;

	/** Connection to the page most recently opened by {@link #init(String)}. */
	private HttpURLConnection con;

	/** Accumulated extracted text; it is never cleared, so it grows across calls to getContent. */
	private StringBuilder contextAll = new StringBuilder("");

	/** Upper bound (exclusive) of the page numbers fetched by getContent. */
	private int pageCount = 0;

	/** Page extension supplied by the caller; currently not used when building page URLs. */
	private String pageType = "";

	/** Creates an instance with no base URL and a page count of zero. */
	public WebGet() {

	}

	/**
	 * Creates an instance for the given base URL with a page count of zero.
	 *
	 * @param url base URL that page numbers are appended to
	 */
	public WebGet(String url) {
		this.myUrl = url;
	}

	/**
	 * Creates a fully configured instance.
	 *
	 * @param url       base URL that page numbers are appended to
	 * @param pageCount exclusive upper bound of the page numbers to fetch
	 * @param pageType  page extension; stored but not used when building URLs
	 */
	public WebGet(String url,int pageCount,String pageType) {
		this.myUrl = url;
		this.pageCount = pageCount;
		this.pageType = pageType;
	}

	/**
	 * Returns the regular expression used to pick fragments out of a page.
	 *
	 * <p>This is a placeholder: the empty pattern matches the empty string at
	 * every position, so it must be replaced with the real expression for the
	 * target site before the output is useful.
	 *
	 * @return the extraction pattern, currently the empty string
	 */
	public String regex() {
		return "";
	}

	/**
	 * Sets the base URL and opens a connection to the given page.
	 *
	 * @param url  base URL that the page number is appended to
	 * @param page page number (without extension)
	 * @throws IOException if the resulting URL is malformed or the connection fails
	 */
	public void init(String url, String page) throws IOException {
		// Honour the caller's URL; it used to be replaced by a hard-coded address.
		this.myUrl = url;
		this.init(page);
	}

	/**
	 * Opens a connection to {@code myUrl + page + ".html"}.
	 *
	 * <p>Does nothing when no base URL has been configured.
	 *
	 * @param page page number (without extension)
	 * @throws IOException if the resulting URL is malformed or the connection fails
	 */
	public void init(String page) throws IOException {
		if (myUrl == null || myUrl.equals("")) {
			return;
		}
		URL pageUrl = new URL(myUrl + page + ".html");
		con = (HttpURLConnection) pageUrl.openConnection();
		// Only the per-connection flag is set. The static setFollowRedirects(true)
		// that used to precede it changed a JVM-wide default and was overridden
		// by this call anyway, so redirects are still not followed.
		con.setInstanceFollowRedirects(false);
		con.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
		con.setReadTimeout(READ_TIMEOUT_MILLIS);
		con.connect();
	}

	/**
	 * Writes a string to a text file using the platform default charset.
	 *
	 * @param context  text to write
	 * @param filePath path of the file to create or overwrite
	 * @return {@code true} once the text has been written
	 * @throws IOException if the file cannot be opened or written
	 */
	public boolean writeTxt(String context,String filePath) throws IOException {
		System.out.println("开始写文件。。");
		OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(
				filePath));
		try {
			osw.write(context, 0, context.length());
			osw.flush();
		} finally {
			// Release the file handle even when the write fails.
			osw.close();
		}

		return true;
	}

	/**
	 * Downloads the pages and returns all text extracted so far.
	 *
	 * <p>Pages {@code 1} to {@code pageCount - 1} are fetched, because the loop
	 * bound is exclusive. NOTE(review): confirm whether the last page is meant
	 * to be included; the behaviour is kept as it was.
	 *
	 * <p>Every match of {@link #regex()} has its tags and spaces removed and is
	 * appended to an instance-level buffer followed by {@code "\n\t"}. The buffer
	 * is not reset, so text from earlier calls on the same instance is returned
	 * again.
	 *
	 * @param codeType charset name used to decode the pages, e.g. GB2312 or UTF-8
	 * @return the accumulated text, or the literal string {@code "null"} when
	 *         the page count is less than 1
	 * @throws IOException if a page cannot be fetched or decoded
	 * @throws IllegalStateException if no base URL has been configured
	 */
	public String getContent(String codeType) throws IOException{
		if(pageCount < 1){
			return "null";
		}
		System.out.println("开始抓取内容。。。。。");
		// The pattern does not change between pages, so compile it once.
		Pattern pattern = Pattern.compile(regex());
		for (int i = 1; i < pageCount; i++) {
			System.out.println("抓取第 " + i + "页");
			this.init(String.valueOf(i));
			if (con == null) {
				// init() is a no-op without a base URL; fail clearly instead of with a bare NPE.
				throw new IllegalStateException(
						"No base URL configured; cannot fetch page " + i);
			}

			Matcher matcher = pattern.matcher(readPage(codeType));
			while (matcher.find()) {
				String text = TAG_PATTERN.matcher(matcher.group()).replaceAll("")
						.replace(" ", "");

				contextAll.append(text).append("\n\t");
			}
			System.out.println("完成：" + i + "页");
			System.out.println("");
		}

		return contextAll.toString();
	}

	/**
	 * Reads the whole body of the current connection into one string.
	 *
	 * <p>Lines are joined without separators, as before, so a pattern can match
	 * across what were line breaks in the page source.
	 */
	private String readPage(String codeType) throws IOException {
		BufferedReader br = null;
		try {
			br = new BufferedReader(new InputStreamReader(con
					.getInputStream(), codeType));
			StringBuilder page = new StringBuilder();
			String line;
			while ((line = br.readLine()) != null) {
				page.append(line);
			}
			return page.toString();
		} finally {
			// Closing the reader closes the response stream and frees the connection.
			if (br != null) {
				br.close();
			}
		}
	}

	/**
	 * Downloads the configured book and saves the extracted text to a fixed file.
	 *
	 * @param args ignored
	 * @throws IOException declared for compatibility; failures are caught and printed
	 */
	public static void main(String[] args) throws IOException {

		WebGet wg = new WebGet("http://www.tianyabook.com/qita/hougeixue/",227,"html");
		try {
			if (wg.writeTxt(wg.getContent("GB2312"),"D:\\houhei.txt")) {
				System.out.println("完成");
			}
		} catch (Exception e) {
			e.printStackTrace();
		}

	}
}

- from the5fire.com

----EOF-----

微信公众号：Python程序员杂谈

分享到：

相关文章

别人正在读

【上一篇】重温设计模式之抽象工厂
【下一篇】解决java发送邮件没有主题且乱码

其他分类：

the5fire

java使用正则表达式抓取网页内容存为txt

公告

个人作品

热门排行（时间加权）

系列文章

最新文章

点击排行

推荐阅读

.