正規表現で、とにかく http で始まる URL らしきものの取得


  VS2010(C#)




using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.Text.RegularExpressions;
using System.IO;

namespace RegexTest {
	/// <summary>
	/// Sample console program: downloads a web page and writes every
	/// substring that looks like an http/https URL to a text file.
	/// </summary>
	class Program {
		/// <summary>
		/// Downloads http://gigazine.net/ as UTF-8 text, extracts URL-like
		/// strings with a regular expression, and writes one per line to
		/// result.txt (encoded as Shift_JIS) in the current directory.
		/// </summary>
		/// <param name="args">Command-line arguments (not used).</param>
		static void Main(string[] args) {

			// Internet access.
			// WebClient is IDisposable; the using statement releases it even
			// when the download throws.
			// Switch the encoding to "shift_jis" or "euc-jp" via
			// Encoding.GetEncoding(...) when the target page is not UTF-8.
			string result;
			using (WebClient client = new WebClient()) {
				client.Encoding = Encoding.UTF8;
				result = client.DownloadString("http://gigazine.net/");
			}

			// Prepare the output text file.
			// Both the stream and the writer are disposed (and the writer
			// flushed) by the using statements, even if a write fails.
			using (FileStream fs = new FileStream("result.txt", FileMode.Create, FileAccess.Write))
			using (StreamWriter sw = new StreamWriter(fs, Encoding.GetEncoding("shift_jis"))) {

				sw.WriteLine("SHIFT_JIS で書き込んでいます");

				// Group 1 lazily captures everything from "http://" or
				// "https://" up to an optional quote followed by one of
				// ';', ')', '>' or whitespace.
				MatchCollection mc = Regex.Matches(result, "(https?://.+?)[\"']?[;)>\\s]");
				foreach (Match match in mc) {
					sw.WriteLine(match.Groups[1].Value);
				}
			}

		}
	}
}


  PHP

<?php
// Downloads a web page and prints every substring that looks like an
// http/https URL, one per line, inside a <pre> element.

// Declare the response as UTF-8 HTML and disable client/proxy caching.
header( "Content-Type: text/html; Charset=utf-8" );
header( "pragma: no-cache" );
header( "Expires: Wed, 31 May 2000 14:59:58 GMT" );
header( "Cache-control: no-cache" );
// A stricter alternative would be:
// Cache-Control: private, no-store, no-cache, must-revalidate

$result = file_get_contents("http://gigazine.net/");
if ( $result === false ) {
	// file_get_contents returns false on failure; stop here instead of
	// running the regex against a non-string value.
	print "<pre>Failed to download the page.</pre>";
	exit;
}

// Group 1 lazily captures everything from "http://" or "https://" up to an
// optional quote followed by one of ';', ')', '>' or whitespace.
// The 'u' modifier treats pattern and subject as UTF-8.
preg_match_all("|(https?://.+?)[\"']?[;)>\\s]|u", $result, $matches, PREG_PATTERN_ORDER );

// preg_match_all can fail (for example on invalid UTF-8 with the 'u'
// modifier), in which case $matches[1] may not exist.
$urls = isset( $matches[1] ) ? $matches[1] : array();

print "<pre>";
foreach( $urls as $value ) {
	// The matched text comes from a remote page, so escape it before
	// embedding it in our own HTML output.
	print(htmlspecialchars($value, ENT_QUOTES, "UTF-8") . "\n");
}
print "</pre>";


?>


  Java

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Sample program: downloads a web page over HTTP, extracts every substring
 * that looks like an http/https URL with a regular expression, and writes
 * the results both to standard output and to a Shift_JIS text file.
 */
public class HttpGetAndRegex {

	/**
	 * Downloads http://gigazine.net/ as UTF-8 text, then prints each
	 * URL-like match and writes it to .\result.txt (Shift_JIS).
	 * Any exception is caught and its stack trace printed.
	 *
	 * @param args command-line arguments (not used)
	 */
	public static void main(String[] args) {
		
		try {
			URL url = new URL("http://gigazine.net/");
			// Connection object
			HttpURLConnection http = (HttpURLConnection)url.openConnection();
			http.setRequestMethod("GET");
			// Connect
			http.connect();
			
			StringBuilder page = new StringBuilder();
			// try-with-resources closes the reader and the stream even when
			// reading fails; the reader decodes the body as UTF-8 because
			// that is the encoding the target page is served in.
			try (InputStream i_stream = http.getInputStream();
					BufferedReader buffer_reader = new BufferedReader(
							new InputStreamReader(i_stream, "UTF-8"))) {
				String line_buffer;
				// readLine returns null at end of input
				while ( null != (line_buffer = buffer_reader.readLine() ) ) {
					// readLine strips the line terminator; put a newline
					// back so a URL at the end of one line is not fused
					// with the beginning of the next line before matching.
					page.append(line_buffer).append('\n');
				}
			}
			finally {
				http.disconnect();
			}
			String result_string = page.toString();
			
			// **************************************************
			// Prepare the output text file
			// http://docs.oracle.com/javase/jp/7/technotes/guides/intl/encoding.doc.html
			// (use "EUC-JP" instead of "SHIFT_JIS" to change the file encoding)
			// **************************************************
			try (PrintWriter pw = new PrintWriter(".\\result.txt", "SHIFT_JIS")) {
				pw.println( "SHIFT_JIS で書き込んでいます" );
				
				// **************************************************
				// Regular-expression search
				// Group 1 lazily captures everything from "http://" or
				// "https://" up to an optional quote followed by one of
				// ';', ')', '>' or whitespace.
				// **************************************************
				String regex = "(https?://.+?)[\"']?[;)>\\s]";
				Pattern pattern = Pattern.compile(regex);
	
				Matcher matcher = pattern.matcher(result_string);
				while(matcher.find()){
					System.out.println(matcher.group(1));
					pw.println( matcher.group(1) );
				}
				
				pw.flush();
			}
			
		}
		catch( Exception e ) {
			e.printStackTrace();
		}
		

	}

}





yahoo  google  MSDN  MSDN(us)  WinFAQ  Win Howto  tohoho  ie_DHTML  vector  wdic  辞書  天気 


[regex]
CCBot/2.0 (https://commoncrawl.org/faq/)
24/11/14 04:12:06
InfoBoard Version 1.00 : Language=Perl

1 BatchHelper COMprog CommonSpec Cprog CprogBase CprogSAMPLE CprogSTD CprogSTD2 CprogWinsock Cygwin GameScript HTML HTMLcss InstallShield InstallShieldFunc JScript JScriptSAMPLE Jsfuncs LLINK OldProg OracleGold OracleSilver PRO PRObrowser PROc PROconePOINT PROcontrol PROftpclient PROjscript PROmailer PROperl PROperlCHAT PROphp PROphpLesson PROphpLesson2 PROphpLesson3 PROphpfunction PROphpfunctionArray PROphpfunctionMisc PROphpfunctionString PROsql PROvb PROvbFunction PROvbString PROvbdbmtn PROvbonepoint PROwebapp PROwin1POINT PROwinSYSTEM PROwinYOROZU PROwindows ProjectBoard RealPHP ScriptAPP ScriptMaster VBRealtime Vsfuncs a1root access accreq adsi ajax amazon argus asp aspSample aspVarious aspdotnet aw2kinst cappvariety centura ckeyword classStyle cmaterial cmbin cmdbapp cmenum cmlang cmlistbox cmstd cmstdseed cmtxt cs daz3d db dbCommon dbaccess dnettool dos download flex2 flex3 flex4 framemtn framereq freeWorld freesoft gimp ginpro giodownload google hdml home hta htmlDom ie9svg install java javaSwing javascript jetsql jquery jsp jspTest jspVarious lightbox listasp listmsapi listmsie listmsiis listmsnt listmspatch listmsscript listmsvb listmsvc memo ms msde mysql netbeans oraPlsql oracle oracleWiper oraclehelper orafunc other panoramio pear perl personal pgdojo pgdojo_cal pgdojo_holiday pgdojo_idx pgdojo_ref pgdojo_req php phpVarious phpguide plsql postgres ps r205 realC realwebapp regex rgaki ruby rule sboard sc scprint scquest sdb sdbquest seesaa setup sh_Imagick sh_canvas sh_dotnet sh_google sh_tool sh_web shadowbox shgm shjquery shvbs shweb sjscript skadai skywalker smalltech sperl sqlq src systemdoc tcpip tegaki three toolbox twitter typeface usb useXML vb vbdb vbsfunc vbsguide vbsrc vpc wcsignup webanymind webappgen webclass webparts webtool webwsh win8 winofsql wmi work wp youtube