三大紙のニュースページからの記事抽出

三大紙(朝日、毎日、読売)のニュースサイトからクロールしたWebページから、記事を抽出するプログラムを書いた。

1つのニュースページからの抽出物は次の3つ

  • 記事タイトル
  • 更新時間
  • 記事内容(画像なし)

実験のためのデータ集めで急いで書きました。あしからず。
ParseNews.java, DataTemplate.java, ParseAsahi.java, ParseMainichi.java, ParseYomiuri.javaの5ファイル構成です。

まず、ParseNewsクラス。クラス名には何度も「Parse」と出てきますが、設計を途中で変更したため、実際にはDOMやSAXでパースするのではなく、正規表現で必要箇所を抽出しています。

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;

/**
 * Collects the files found directly inside one input directory and hands them
 * to the scraper selected by {@link #mode}, which appends its results to
 * {@link #output_file}.
 *
 * <p>Constructing an instance runs the whole job: see
 * {@link #initialize(String, String, String)}.
 */
public class ParseNews {
	/** Paths of the regular files found directly inside the input directory. */
	ArrayList<String> filename;
	/** File the selected scraper writes to; an existing file is deleted first. */
	File output_file;
	/** Scraper selector: 0=asahi, 1=mainichi, 2=yomiuri (anything else falls back to asahi). */
	int mode=0;
	/** Output path used by the two-argument constructor. */
	String default_save_file="../output.txt";

	/**
	 * Runs the scraper selected by {@link #mode} over {@link #filename}.
	 *
	 * @throws FileNotFoundException declared by the scrapers
	 * @throws UnsupportedEncodingException declared by the scrapers
	 * @throws IOException declared by the scrapers
	 */
	public void parser() throws FileNotFoundException, UnsupportedEncodingException, IOException {
		switch(mode) {
		case 1:
			new ParseMainichi().scraper(filename,output_file);
			break;
		case 2:
			new ParseYomiuri().scraper(filename,output_file);
			break;
		case 0:
		default:
			// Unknown modes are treated like mode 0.
			new ParseAsahi().scraper(filename,output_file);
			break;
		}
	}

	/**
	 * Sets the mode, lists the input directory, resets the output file and
	 * then calls {@link #parser()}.
	 *
	 * @param mode_str  decimal mode number; if it is not a number the stack
	 *                  trace is printed and the current mode is kept
	 * @param path_str  directory whose regular files are the input pages
	 * @param save_file path of the output file
	 * @throws NumberFormatException declared for compatibility; a malformed
	 *         {@code mode_str} is caught inside this method
	 * @throws FileNotFoundException propagated from {@link #parser()}
	 * @throws UnsupportedEncodingException propagated from {@link #parser()}
	 * @throws IOException propagated from {@link #parser()}
	 */
	public void initialize(String mode_str, String path_str, String save_file) throws NumberFormatException, FileNotFoundException, UnsupportedEncodingException, IOException {
		//set mode
		try {
			mode=Integer.parseInt(mode_str);
		} catch(NumberFormatException e) {
			// Best effort: report the problem and continue with the mode already set.
			e.printStackTrace();
		}
		//set input files
		filename=new ArrayList<String>();
		File path=new File(path_str);
		if(path.isDirectory()) {
			// List the directory exactly once: repeated list()/listFiles() calls are
			// wasteful and two separate listings need not agree index by index.
			File[] entries=path.listFiles();
			if(entries==null) {
				// listFiles() returns null when the directory cannot be read.
				System.out.println("This directory can not be read:(\nDirectory is "+path_str+".");
			} else if(entries.length==0) {
				System.out.println("This directory has no file:(\nDirectory is "+path_str+".");
			} else {
				for(File entry : entries) {
					// getPath() joins directory and name with a separator, so path_str
					// no longer has to end with one.
					if(entry.isFile()) {
						filename.add(entry.getPath());
					} else {
						System.out.println("This path is not file:(\nPath is "+entry.getPath()+".");
					}
				}
			}
		} else {
			System.out.println("This path is not directory:(\nPath is "+path_str+".");
		}
		//set output file
		output_file=new File(save_file);
		// The scrapers append to the output file, so a stale file must be removed first.
		if(output_file.exists()&&!output_file.delete()) {
			System.out.println("This old output file can not be deleted:(\nFile is "+save_file+".");
		}
		//parse input files and save them to output file
		parser();
	}

	/**
	 * Runs the extraction and writes the result to {@code save_file}.
	 *
	 * @param mode_str  mode number as text (0=asahi, 1=mainichi, 2=yomiuri)
	 * @param path_str  input directory
	 * @param save_file output file path
	 */
	public ParseNews(String mode_str, String path_str, String save_file) throws NumberFormatException, FileNotFoundException, UnsupportedEncodingException, IOException {
		initialize(mode_str, path_str, save_file);
	}

	/**
	 * Runs the extraction and writes the result to {@link #default_save_file}.
	 *
	 * @param mode_str mode number as text (0=asahi, 1=mainichi, 2=yomiuri)
	 * @param path_str input directory
	 */
	public ParseNews(String mode_str, String path_str) throws NumberFormatException, FileNotFoundException, UnsupportedEncodingException, IOException {
		initialize(mode_str, path_str, default_save_file);
	}

	/**
	 * Processes the three hard-coded crawl directories, one per newspaper.
	 * The command-line arguments are currently ignored.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException, IOException {
		String prefix_input="/Users/sakamotokoutarou/Dropbox/crawler/",
			   prefix_output="/Users/sakamotokoutarou/Documents/workspace/ParseNews/";
		new ParseNews("0",prefix_input+"asahi/",prefix_output+"output_Asahi.txt");
		new ParseNews("1",prefix_input+"mainichi/",prefix_output+"output_Mainichi.txt");
		new ParseNews("2",prefix_input+"yomiuri/",prefix_output+"output_Yomiuri.txt");
/*		command line variant (disabled)
 		if(args.length==3){
			new ParseNews(args[0],args[1],args[2]);
		}else if(args.length==2){
			new ParseNews(args[0],args[1]);
		}else{
			System.out.println("The number of arguments is not correct:(");
		}
*/
	}

}

DataTemplateクラス

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;

/**
 * Per-page working state shared by the newspaper-specific scrapers, plus the
 * routine that appends one extracted article to the output file.
 */
public class DataTemplate {
	/** Current input line, extracted title and extracted date. */
	String str="",title="",date="";
	/** Progress flags; each scraper decides what the generic ones mean. */
	boolean flag=false,flag2=false,flag3=false,flag_title=false,flag_date=false,flag_date2=false;
	/** Extracted body lines of the current page. */
	ArrayList<String> lines=new ArrayList<String>();

	/**
	 * Resets all per-page state so the next page starts clean.
	 */
	public void ini() {
		str="";
		title="";
		date="";
		flag=false;
		flag2=false;
		// flag3 was previously left out of the reset and would have leaked
		// from one page into the next.
		flag3=false;
		flag_title=false;
		flag_date=false;
		flag_date2=false;
		lines.clear();
	}

	/**
	 * Appends the current title, date and body lines to {@code output_file}
	 * as UTF-8, followed by a {@code ---} separator line. Missing parts are
	 * reported on standard output together with the input file name.
	 *
	 * @param filename    list of input file names, used only for messages
	 * @param output_file file to append to
	 * @param i           index into {@code filename} of the page being printed
	 * @throws UnsupportedEncodingException declared for compatibility; caught and printed here
	 * @throws FileNotFoundException declared for compatibility; caught and printed here
	 */
	public void print(ArrayList<String>filename, File output_file,int i) throws UnsupportedEncodingException, FileNotFoundException {
		PrintWriter out=null;
		try {
			// Open in append mode: every page of a run goes into the same file.
			out=new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output_file,true),"UTF-8")));
			if(!title.isEmpty()) {
				out.println("TITLE="+title);
			} else {
				out.println("TITLE=no title");
				System.out.println("This page does not contain title:(\nFilename is "+filename.get(i)+".");
			}
			if(!date.isEmpty()) {
				out.println("DATE="+date);
			} else {
				out.println("DATE=no date");
				System.out.println("This page does not contain date:(\nFilename is "+filename.get(i)+".");
			}
			if(!lines.isEmpty()) {
				for(String line : lines) {
					out.println(line);
				}
			} else {
				System.out.println("This page does not contain any news:(\nFilename is "+filename.get(i)+".");
			}
			//border line between articles
			out.println("---");
			// PrintWriter never throws on write failure; checkError() flushes and
			// reports whether anything went wrong.
			if(out.checkError()) {
				System.out.println("This output file can not be written:(\nFile is "+output_file+".");
			}
		} catch(UnsupportedEncodingException e) {
			e.printStackTrace();
		} catch(FileNotFoundException e) {
			e.printStackTrace();
		} finally {
			// Close even when an unexpected runtime exception escapes the block above.
			if(out!=null) {
				out.close();
			}
		}
	}
}

ここからは、三紙で共通しない部分を処理するクラスについて。各紙ごとにParseAsahiクラス、ParseMainichiクラス、ParseYomiuriクラスを作成した。まずはParseAsahiクラス。

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Line-oriented regular-expression scraper for the Asahi pages.
 *
 * <p>Per file it looks, in this order, for a {@code <title>} line, then a
 * {@code <dd class="FloatR">} line (used as the date), and then copies the
 * lines between the {@code main text start} and {@code main text end}
 * comments.
 */
public class ParseAsahi extends DataTemplate {
	/** Character encoding used to read the input files. */
	String encoding="EUC-JP";
	Pattern pTitle=Pattern.compile("<title>([^<]+)<"),
			pDate=Pattern.compile(".*<dd class=\"FloatR\">([^<]+)<"),
			pAsahi1=Pattern.compile(".*<!--main text start-->"),
			pAsahi2=Pattern.compile(".*<!--main text end-->"),
			// Body lines that begin with markup are skipped. The former pattern
			// "^[<p>|</p>]" was a character class, so besides '<' it also dropped
			// text lines starting with 'p', '>', '|' or '/'. Lines starting with
			// '<' (including "<p>" and "</p>") are still dropped as before.
			pAsahi3=Pattern.compile("^<");
	Matcher mTitle,mDate;

	/**
	 * Extracts title, date and body from every listed file and appends each
	 * result to {@code output_file} via {@code print}. I/O problems with one
	 * file are printed and do not stop the remaining files.
	 *
	 * @param filename    paths of the pages to read
	 * @param output_file file the extracted articles are appended to
	 * @throws FileNotFoundException declared for compatibility; caught per file
	 * @throws UnsupportedEncodingException declared for compatibility; caught per file
	 * @throws IOException declared for compatibility; caught per file
	 */
	public void scraper(ArrayList<String>filename,File output_file) throws FileNotFoundException, UnsupportedEncodingException, IOException {
		for(int i=0;i<filename.size();i++) {
			try {
				ini();
				File file=new File(filename.get(i));
				if(!file.exists()) {
					System.out.println("This file does not exist:(\nFilename is "+filename.get(i)+".");
					continue;
				}
				FileInputStream in=new FileInputStream(file);
				try {
					BufferedReader br=new BufferedReader(new InputStreamReader(in,encoding));
					while((str=br.readLine())!=null) {
						if(!flag_title) {
							// Stage 1: wait for the title line.
							mTitle=pTitle.matcher(str);
							if(mTitle.find()) {
								title=mTitle.group(1);
								flag_title=true;
							}
						} else if(!flag_date) {
							// Stage 2: wait for the date line.
							mDate=pDate.matcher(str);
							if(mDate.find()) {
								date=mDate.group(1);
								flag_date=true;
							}
						} else {
							// Stage 3: body. The end marker stops the file immediately.
							if(pAsahi2.matcher(str).find()) {
								break;
							}
							if(flag&&!pAsahi3.matcher(str).find()) {
								lines.add(str);
							}
							// Checked last so the start-marker line itself is not copied.
							if(pAsahi1.matcher(str).find()) {
								flag=true;
							}
						}
					}
				} finally {
					// Release the file handle even when reading fails part-way.
					in.close();
				}
				print(filename,output_file,i);
			} catch(FileNotFoundException e) {
				e.printStackTrace();
			} catch(UnsupportedEncodingException e) {
				e.printStackTrace();
			} catch(IOException e) {
				e.printStackTrace();
			}
		}
	}
}

ParseMainichiクラス。

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Line-oriented regular-expression scraper for the Mainichi pages.
 *
 * <p>Title, date and body are searched independently on every line:
 * the title comes from the {@code class="NewsTitle"} element, the date is the
 * first {@code <p>} text found on a line after one containing
 * {@code class="Credit}, and the body is the {@code <p>} text on the lines
 * after one containing {@code class="NewsBody}.
 */
public class ParseMainichi extends DataTemplate {
	/** Character encoding used to read the input files. */
	String encoding="UTF-8";
	Pattern pTitle=Pattern.compile("class=\"NewsTitle\"[^>]*>([^<]+)<"),
			pDate1=Pattern.compile("class=\"Credit"),
			pDate2=Pattern.compile("<p>([^<]+)<"),
			pMainichi1=Pattern.compile("class=\"NewsBody"),
			pMainichi2=Pattern.compile("class=\"Credit"),
			pMainichi2_1=Pattern.compile("<!-- /本文部分 -->"),
			pMainichi3=Pattern.compile("<p>([^<]+)<");
	Matcher mTitle,mDate1,mDate2,mMainichi3;

	/**
	 * Extracts title, date and body from every listed file and appends each
	 * result to {@code output_file} via {@code print}. I/O problems with one
	 * file are printed and do not stop the remaining files.
	 *
	 * @param filename    paths of the pages to read
	 * @param output_file file the extracted articles are appended to
	 * @throws FileNotFoundException declared for compatibility; caught per file
	 * @throws UnsupportedEncodingException declared for compatibility; caught per file
	 * @throws IOException declared for compatibility; caught per file
	 */
	public void scraper(ArrayList<String>filename,File output_file) throws FileNotFoundException, UnsupportedEncodingException, IOException {
		for(int i=0;i<filename.size();i++) {
			try {
				ini();
				File file=new File(filename.get(i));
				if(!file.exists()) {
					System.out.println("This file does not exist:(\nFilename is "+filename.get(i)+".");
					continue;
				}
				FileInputStream in=new FileInputStream(file);
				try {
					BufferedReader br=new BufferedReader(new InputStreamReader(in,encoding));
					while((str=br.readLine())!=null) {
						// Title: first matching line wins.
						if(!flag_title) {
							mTitle=pTitle.matcher(str);
							if(mTitle.find()) {
								title=mTitle.group(1);
								flag_title=true;
							}
						}
						// Date: first see the Credit line, then take the first <p> text
						// on a later line.
						if(!flag_date) {
							mDate1=pDate1.matcher(str);
							if(mDate1.find()) {
								flag_date=true;
							}
						} else if(!flag_date2) {
							mDate2=pDate2.matcher(str);
							if(mDate2.find()) {
								date=mDate2.group(1);
								flag_date2=true;
							}
						}
						// Body end: a Credit line while inside the body, or the closing
						// comment anywhere. Parentheses only spell out the precedence the
						// expression already had.
						// NOTE(review): possibly "flag && (Credit || comment)" was meant - confirm.
						if((flag&&pMainichi2.matcher(str).find())||pMainichi2_1.matcher(str).find()) {
							flag=false;
							flag2=true;
						}
						if(flag) {
							mMainichi3=pMainichi3.matcher(str);
							if(mMainichi3.find()) {
								lines.add(mMainichi3.group(1));
							}
						}
						// Checked after the capture so the NewsBody line itself is skipped.
						if(pMainichi1.matcher(str).find()) {
							flag=true;
						}
						// Stop early once both the body end and the date have been seen.
						if(flag2&&flag_date2) {
							break;
						}
					}
				} finally {
					// Release the file handle even when reading fails part-way.
					in.close();
				}
				print(filename,output_file,i);
			} catch(FileNotFoundException e) {
				e.printStackTrace();
			} catch(UnsupportedEncodingException e) {
				e.printStackTrace();
			} catch(IOException e) {
				e.printStackTrace();
			}
		}
	}
}

ParseYomiuriクラス。

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Line-oriented regular-expression scraper for the Yomiuri pages.
 *
 * <p>Per file it first waits for a {@code <title>} line. On the following
 * lines it looks for the {@code date_start} comment (date) and copies the
 * {@code <p>} text found between the {@code google_ad_region_start=region1}
 * and {@code google_ad_region_end=region1} comments.
 */
public class ParseYomiuri extends DataTemplate {
	/** Character encoding used to read the input files. */
	String encoding="Shift-JIS";
	Pattern pTitle=Pattern.compile(".*<title>([^<]+)<"),
			pDate=Pattern.compile(".*<!--// date_start //-->([^<]+)<"),
			pYomiuri1=Pattern.compile("<!-- google_ad_region_start=region1 -->"),
			pYomiuri2=Pattern.compile("<!-- google_ad_region_end=region1 -->"),
			pYomiuri3=Pattern.compile("<p[^>]*>([^<]+)<");
	Matcher mTitle,mDate,mYomiuri3;

	/**
	 * Extracts title, date and body from every listed file and appends each
	 * result to {@code output_file} via {@code print}. I/O problems with one
	 * file are printed and do not stop the remaining files.
	 *
	 * @param filename    paths of the pages to read
	 * @param output_file file the extracted articles are appended to
	 * @throws FileNotFoundException declared for compatibility; caught per file
	 * @throws UnsupportedEncodingException declared for compatibility; caught per file
	 * @throws IOException declared for compatibility; caught per file
	 */
	public void scraper(ArrayList<String>filename,File output_file) throws FileNotFoundException, UnsupportedEncodingException, IOException {
		for(int i=0;i<filename.size();i++) {
			try {
				ini();
				File file=new File(filename.get(i));
				if(!file.exists()) {
					System.out.println("This file does not exist:(\nFilename is "+filename.get(i)+".");
					continue;
				}
				// Set once the end marker has been seen while the body was being read.
				boolean body_closed=false;
				FileInputStream in=new FileInputStream(file);
				try {
					BufferedReader br=new BufferedReader(new InputStreamReader(in,encoding));
					while((str=br.readLine())!=null) {
						if(!flag_title) {
							// Nothing else is examined until the title line has been found.
							mTitle=pTitle.matcher(str);
							if(mTitle.find()) {
								title=mTitle.group(1);
								flag_title=true;
							}
							continue;
						}
						if(!flag_date) {
							mDate=pDate.matcher(str);
							if(mDate.find()) {
								date=mDate.group(1);
								flag_date=true;
							}
						}
						boolean end_here=pYomiuri2.matcher(str).find();
						if(end_here) {
							flag2=true;
						}
						// Previously capture continued after the end marker whenever the
						// date had not been found yet, pulling unrelated <p> text into the
						// article. The end-marker line itself is still examined, as before.
						if(flag&&!body_closed) {
							mYomiuri3=pYomiuri3.matcher(str);
							if(mYomiuri3.find()) {
								lines.add(mYomiuri3.group(1));
							}
						}
						if(flag&&end_here) {
							body_closed=true;
						}
						// Checked after the capture so the start-marker line itself is skipped.
						if(pYomiuri1.matcher(str).find()) {
							flag=true;
						}
						// Stop early once both the date and the body end have been seen.
						if(flag_date&&flag2) {
							break;
						}
					}
				} finally {
					// Release the file handle even when reading fails part-way.
					in.close();
				}
				print(filename,output_file,i);
			} catch(FileNotFoundException e) {
				e.printStackTrace();
			} catch(UnsupportedEncodingException e) {
				e.printStackTrace();
			} catch(IOException e) {
				e.printStackTrace();
			}
		}
	}
}