三大紙のニュースページからの記事抽出
三大紙(朝日、毎日、読売)のニュースサイトからクロールされたWebページから、記事を抽出するプログラムを書いた。
1つのニュースページからの抽出物は次の3つ
- 記事タイトル
- 更新時間
- 記事内容(画像なし)
実験のためのデータ集めで急いで書きました。あしからず。
ParseNews.java, DataTemplate.java, ParseAsahi.java, ParseMainichi.java, ParseYomiuri.javaの5ファイル構成です。
まず、ParseNewsクラス。クラス名に何度もパースって出てきますが、設計を途中で変更したのでDOMやSAXを使わず正規表現で必要箇所を抽出しています。
import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.util.ArrayList; public class ParseNews { /** * @param args * @throws UnsupportedEncodingException * @throws FileNotFoundException * @throws IOException */ ArrayList<String> filename; File output_file; int mode=0;//0=asahi, 1=mainichi, 2=yomiuri String default_save_file="../output.txt"; public void parser() throws FileNotFoundException, UnsupportedEncodingException, IOException { //parse all input files switch(mode) { case 0:(new ParseAsahi()).scraper(filename,output_file); break; case 1:(new ParseMainichi()).scraper(filename,output_file);break; case 2:(new ParseYomiuri()).scraper(filename,output_file); break; default:(new ParseAsahi()).scraper(filename,output_file); break; } } public void initialize(String mode_str, String path_str, String save_file) throws NumberFormatException, FileNotFoundException, UnsupportedEncodingException, IOException { //set mode try { mode=Integer.parseInt(mode_str); } catch(NumberFormatException e) { e.printStackTrace(); } //set input files filename=new ArrayList<String>(); File path=new File(path_str); if(path.isDirectory()) { if(path.list().length>0) { for(int i=0;i<path.list().length;i++) { if(path.listFiles()[i].isFile()) { filename.add(path_str+path.list()[i]); } else { System.out.println("This path is not file:(\nPath is "+path_str+path.list()[i]+"."); } } } else { System.out.println("This directory has no file:(\nDirectory is "+path_str+"."); } } else { System.out.println("This path is not directory:(\nPath is "+path_str+"."); } //set output file output_file=new File(save_file); if(output_file.exists()) { output_file.delete(); } else {} //parse input files and save them to output file parser(); } public ParseNews(String mode_str, String path_str, String save_file) throws NumberFormatException, FileNotFoundException, UnsupportedEncodingException, IOException { initialize(mode_str, 
path_str, save_file); } public ParseNews(String mode_str, String path_str) throws NumberFormatException, FileNotFoundException, UnsupportedEncodingException, IOException { initialize(mode_str, path_str, default_save_file); } public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException, IOException { // TODO Auto-generated method stub String prefix_input="/Users/sakamotokoutarou/Dropbox/crawler/", prefix_output="/Users/sakamotokoutarou/Documents/workspace/ParseNews/"; new ParseNews("0",prefix_input+"asahi/",prefix_output+"output_Asahi.txt"); new ParseNews("1",prefix_input+"mainichi/",prefix_output+"output_Mainichi.txt"); new ParseNews("2",prefix_input+"yomiuri/",prefix_output+"output_Yomiuri.txt"); /* command line if(args.length==3){ new ParseNews(args[0],args[1],args[2]); }else if(args.length==2){ new ParseNews(args[0],args[1]); }else{ System.out.println("The number of arguments is not correct:("); } */ } }
DataTemplateクラス
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;

/**
 * Per-article working state (title, date, body lines and scan flags) plus the
 * routine that appends one extracted article to the output file.
 */
public class DataTemplate {
    /** Current input line, extracted title and extracted date. */
    String str="",title="",date="";
    /** Scan-state flags; their exact meaning is defined by the code that sets them. */
    boolean flag=false,flag2=false,flag3=false,flag_title=false,flag_date=false,flag_date2=false;
    /** Extracted body lines of the current article. */
    ArrayList<String> lines=new ArrayList<String>();

    /**
     * Resets all per-article state so the instance can be reused for the next
     * input file. Every flag is cleared, including {@code flag3}.
     */
    public void ini() {
        str="";
        title="";
        date="";
        flag=false;
        flag2=false;
        flag3=false;
        flag_title=false;
        flag_date=false;
        flag_date2=false;
        lines.clear();
    }

    /**
     * Appends the current article to {@code output_file} as UTF-8:
     * a {@code TITLE=} line, a {@code DATE=} line, the body lines and a
     * {@code ---} separator. Missing parts are reported on standard output.
     *
     * Failures to open the file are not propagated; the stack trace is printed.
     *
     * @param filename    input file names, used only for the diagnostics
     * @param output_file file the article is appended to
     * @param i           index into {@code filename} of the article's source file
     */
    public void print(ArrayList<String>filename, File output_file,int i) throws UnsupportedEncodingException, FileNotFoundException {
        PrintWriter out=null;
        try {
            // second FileOutputStream argument = append mode
            out=new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output_file,true),"UTF-8")));

            if(title.isEmpty()) {
                out.println("TITLE=no title");
                System.out.println("This page does not contain title:(\nFilename is "+filename.get(i)+".");
            } else {
                out.println("TITLE="+title);
            }

            if(date.isEmpty()) {
                out.println("DATE=no date");
                System.out.println("This page does not contain date:(\nFilename is "+filename.get(i)+".");
            } else {
                out.println("DATE="+date);
            }

            if(lines.isEmpty()) {
                System.out.println("This page does not contain any news:(\nFilename is "+filename.get(i)+".");
            } else {
                for(String line:lines) {
                    out.println(line);
                }
            }

            //border line between articles
            out.println("---");

            // PrintWriter never throws on write errors; checkError() flushes
            // and reports whether one happened.
            if(out.checkError()) {
                System.out.println("This output file cannot be written:(\nOutput file is "+output_file+".");
            }
        } catch(UnsupportedEncodingException e) {
            e.printStackTrace();
        } catch(FileNotFoundException e) {
            e.printStackTrace();
        } finally {
            // release the file handle even if something above threw
            if(out!=null) {
                out.close();
            }
        }
    }
}
ここからは三紙で共通しない部分の処理のためのクラスについて。それぞれParseAsahiクラス、ParseMainichiクラス、ParseYomiuriクラスを作成した。
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Line-oriented, regular-expression based extractor for the pages handled in
 * mode 0. Input files are read as EUC-JP.
 */
public class ParseAsahi extends DataTemplate {
    String encoding="EUC-JP";
    Pattern pTitle=Pattern.compile("<title>([^<]+)<"),
            pDate=Pattern.compile(".*<dd class=\"FloatR\">([^<]+)<"),
            // start / end markers of the article body
            pAsahi1=Pattern.compile(".*<!--main text start-->"),
            pAsahi2=Pattern.compile(".*<!--main text end-->"),
            // NOTE(review): this is a character class, so it matches any line
            // whose first character is one of < p > | / -- not only lines
            // starting with "<p>" or "</p>". Kept as is because changing it
            // changes which lines are extracted; confirm the intent.
            pAsahi3=Pattern.compile("^[<p>|</p>]");
    Matcher mTitle,mDate;

    /**
     * Extracts title, date and body from every file and appends each article
     * to {@code output_file} via {@code print}.
     *
     * Per file the scan is strictly sequential: the title is searched first,
     * the date only after a title was found, and the body only after a date
     * was found. Body lines are the lines after the start marker that do not
     * match {@code pAsahi3}; the scan stops at the end marker.
     *
     * I/O errors are not propagated: the stack trace is printed and the next
     * file is processed.
     *
     * @param filename    paths of the input files
     * @param output_file file the extracted articles are written to
     */
    public void scraper(ArrayList<String>filename,File output_file) throws FileNotFoundException, UnsupportedEncodingException, IOException {
        for(int i=0;i<filename.size();i++) {
            try {
                ini();
                File file=new File(filename.get(i));
                if(!file.exists()) {
                    System.out.println("This file does not exist:(\nFilename is "+filename.get(i)+".");
                    continue;
                }

                FileInputStream in=new FileInputStream(filename.get(i));
                try {
                    BufferedReader br=new BufferedReader(new InputStreamReader(in,encoding));
                    while((str=br.readLine())!=null) {
                        if(!flag_title) {
                            mTitle=pTitle.matcher(str);
                            if(mTitle.find()) {
                                title=mTitle.group(1);
                                flag_title=true;
                            }
                        } else if(!flag_date) {
                            mDate=pDate.matcher(str);
                            if(mDate.find()) {
                                date=mDate.group(1);
                                flag_date=true;
                            }
                        } else {
                            if(pAsahi2.matcher(str).find()) {
                                break;
                            }
                            if(flag&&!pAsahi3.matcher(str).find()) {
                                lines.add(str);
                            }
                            // checked last so the marker line itself is not collected
                            if(pAsahi1.matcher(str).find()) {
                                flag=true;
                            }
                        }
                    }
                } finally {
                    // always release the file handle, also when reading fails
                    // or the charset name is rejected
                    in.close();
                }
                print(filename,output_file,i);
            } catch(FileNotFoundException e) {
                e.printStackTrace();
            } catch(UnsupportedEncodingException e) {
                e.printStackTrace();
            } catch(IOException e) {
                e.printStackTrace();
            }
        }
    }
}
ParseMainichiクラス。
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Line-oriented, regular-expression based extractor for the pages handled in
 * mode 1. Input files are read as UTF-8.
 */
public class ParseMainichi extends DataTemplate {
    String encoding="UTF-8";
    Pattern pTitle=Pattern.compile("class=\"NewsTitle\"[^>]*>([^<]+)<"),
            // the date is the first <p> text on a line AFTER the "Credit" line
            pDate1=Pattern.compile("class=\"Credit"),
            pDate2=Pattern.compile("<p>([^<]+)<"),
            // body start marker, two alternative end markers, body paragraph
            pMainichi1=Pattern.compile("class=\"NewsBody"),
            pMainichi2=Pattern.compile("class=\"Credit"),
            pMainichi2_1=Pattern.compile("<!-- /本文部分 -->"),
            pMainichi3=Pattern.compile("<p>([^<]+)<");
    Matcher mTitle,mDate1,mDate2,mMainichi3;

    /**
     * Extracts title, date and body from every file and appends each article
     * to {@code output_file} via {@code print}.
     *
     * Title, date and body are searched independently on every line. The scan
     * of a file stops once both the end of the body and the date were seen.
     *
     * I/O errors are not propagated: the stack trace is printed and the next
     * file is processed.
     *
     * @param filename    paths of the input files
     * @param output_file file the extracted articles are written to
     */
    public void scraper(ArrayList<String>filename,File output_file) throws FileNotFoundException, UnsupportedEncodingException, IOException {
        for(int i=0;i<filename.size();i++) {
            try {
                ini();
                File file=new File(filename.get(i));
                if(!file.exists()) {
                    System.out.println("This file does not exist:(\nFilename is "+filename.get(i)+".");
                    continue;
                }

                FileInputStream in=new FileInputStream(filename.get(i));
                try {
                    BufferedReader br=new BufferedReader(new InputStreamReader(in,encoding));
                    while((str=br.readLine())!=null) {
                        if(!flag_title) {
                            mTitle=pTitle.matcher(str);
                            if(mTitle.find()) {
                                title=mTitle.group(1);
                                flag_title=true;
                            }
                        }

                        if(!flag_date) {
                            // step 1: find the "Credit" line
                            mDate1=pDate1.matcher(str);
                            if(mDate1.find()) {
                                flag_date=true;
                            }
                        } else if(!flag_date2) {
                            // step 2: on a later line, take the first <p> text
                            mDate2=pDate2.matcher(str);
                            if(mDate2.find()) {
                                date=mDate2.group(1);
                                flag_date2=true;
                            }
                        }

                        // End of body: "Credit" counts only while inside the
                        // body, the closing comment counts always.
                        if((flag&&pMainichi2.matcher(str).find())||pMainichi2_1.matcher(str).find()) {
                            flag=false;
                            flag2=true;
                        }

                        if(flag) {
                            mMainichi3=pMainichi3.matcher(str);
                            if(mMainichi3.find()) {
                                lines.add(mMainichi3.group(1));
                            }
                        }

                        // checked after collecting, so the marker line itself is skipped
                        if(pMainichi1.matcher(str).find()) {
                            flag=true;
                        }

                        if(flag2&&flag_date2) {
                            break;
                        }
                    }
                } finally {
                    // always release the file handle, also when reading fails
                    // or the charset name is rejected
                    in.close();
                }
                print(filename,output_file,i);
            } catch(FileNotFoundException e) {
                e.printStackTrace();
            } catch(UnsupportedEncodingException e) {
                e.printStackTrace();
            } catch(IOException e) {
                e.printStackTrace();
            }
        }
    }
}
ParseYomiuriクラス。
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Line-oriented, regular-expression based extractor for the pages handled in
 * mode 2. Input files are read with the charset named "Shift-JIS".
 */
public class ParseYomiuri extends DataTemplate {
    String encoding="Shift-JIS";
    Pattern pTitle=Pattern.compile(".*<title>([^<]+)<"),
            pDate=Pattern.compile(".*<!--// date_start //-->([^<]+)<"),
            // start / end markers of the article body, body paragraph
            pYomiuri1=Pattern.compile("<!-- google_ad_region_start=region1 -->"),
            pYomiuri2=Pattern.compile("<!-- google_ad_region_end=region1 -->"),
            pYomiuri3=Pattern.compile("<p[^>]*>([^<]+)<");
    Matcher mTitle,mDate,mYomiuri3;

    /**
     * Extracts title, date and body from every file and appends each article
     * to {@code output_file} via {@code print}.
     *
     * Nothing else is searched until a title was found. After that, date and
     * body are searched on every line, and the scan of a file stops once both
     * the date and the end marker were seen.
     *
     * I/O errors are not propagated: the stack trace is printed and the next
     * file is processed.
     *
     * @param filename    paths of the input files
     * @param output_file file the extracted articles are written to
     */
    public void scraper(ArrayList<String>filename,File output_file) throws FileNotFoundException, UnsupportedEncodingException, IOException {
        for(int i=0;i<filename.size();i++) {
            try {
                ini();
                File file=new File(filename.get(i));
                if(!file.exists()) {
                    System.out.println("This file does not exist:(\nFilename is "+filename.get(i)+".");
                    continue;
                }

                FileInputStream in=new FileInputStream(filename.get(i));
                try {
                    BufferedReader br=new BufferedReader(new InputStreamReader(in,encoding));
                    while((str=br.readLine())!=null) {
                        if(!flag_title) {
                            mTitle=pTitle.matcher(str);
                            if(mTitle.find()) {
                                title=mTitle.group(1);
                                flag_title=true;
                            }
                            continue;
                        }

                        if(!flag_date) {
                            mDate=pDate.matcher(str);
                            if(mDate.find()) {
                                date=mDate.group(1);
                                flag_date=true;
                            }
                        }

                        if(pYomiuri2.matcher(str).find()) {
                            flag2=true;
                        }

                        // NOTE(review): flag is never cleared, so paragraphs
                        // after the end marker are still collected until the
                        // date has been found as well -- confirm this is intended.
                        if(flag) {
                            mYomiuri3=pYomiuri3.matcher(str);
                            if(mYomiuri3.find()) {
                                lines.add(mYomiuri3.group(1));
                            }
                        }

                        // checked after collecting, so the marker line itself is skipped
                        if(pYomiuri1.matcher(str).find()) {
                            flag=true;
                        }

                        if(flag_date&&flag2) {
                            break;
                        }
                    }
                } finally {
                    // always release the file handle, also when reading fails
                    // or the charset name is rejected
                    in.close();
                }
                print(filename,output_file,i);
            } catch(FileNotFoundException e) {
                e.printStackTrace();
            } catch(UnsupportedEncodingException e) {
                e.printStackTrace();
            } catch(IOException e) {
                e.printStackTrace();
            }
        }
    }
}