(************************************************************ general_trie.ml Created : Sat Dec 21 22:24:27 2002 Last modified: Sat Dec 21 23:19:12 2002 Compile: ocamlopt.opt unix.cmxa general_trie.ml -o general_trie # FTP Directory: sources/ocaml # ************************************************************) (** @author Takashi Masuyama つくれるかな? 2002/12/18 できた。 treat_type と組み合わせて 型名、型コンストラクタ名を mliファイルに対してのみ trie に入れることができた。 性能評価をやってみたい。 始めの数段の配列化。何段やるといいか? (メモリ vs 効率 あとキャッシュ!?) 2002/12/17 普通にラベルつきの木。 問題 tak takashi のように prefix となっているものの表現はどうする? そもそも文字列が存在することを表すものは? ノードが文字列を表現するので、ノードのラベルとしてそこで終了する文字列があるかを 書く (bool) @author Takashi Masuyama *) open Printf type array_part = ArrayPart of int * bool * ((array_part option) array) | TreePart of tree_part and tree_part = T of char * bool * (tree_part list) (* rose とかいうらしい↑ *) ;; let depth_of_array_part = 3 ;; let array_size = 26 ;; let init_array_part () = ArrayPart(1, false, Array.make array_size None) ;; let empty = T('=', false, []) ;; let depth = function ArrayPart (d, _, _) -> d | _ -> assert false ;; (* 探しているtree partを取り出してくる。削除したリストも返す *) let remove_tree_part_from_list ch lst = (* prev 部分が逆順に!! *) let rec iter remain prev = match remain with [] -> (None, prev) | (T(ch', _, _) as r)::tl -> if ch = ch' then (Some r, List.append prev tl) else iter tl (r::prev) in iter lst [] ;; let print_tree_part t = let rec iter space (T(ch, b, lst)) = printf "%s%c (%b)\n" space ch b; List.iter (fun t -> iter (space^" ") t) lst in iter "" t ;; (* top level *) let rec add_to_tree_part t str from_pos = let len = String.length str in let rec iter (T(ch, b, lst) as t) pos = if len = pos then T(ch, true, lst) else let ch' = String.get str pos in let (t, lst') = remove_tree_part_from_list ch' lst in let new_head = iter (match t with Some t -> t | None -> (T(ch', b, []))) (pos+1) in T(ch, b, new_head::lst') in iter t from_pos ;; (* 全部訪問してtrueならばリストに単語追加 *) let find_all_words t = let rec iter prefix (T(ch,b,lst) as t) result = let new_prefix = prefix^(String.make 1 ch) in let new_result = if b then (prefix^(String.make 1 ch))::result else result in List.fold_right (iter new_prefix) lst new_result in iter "" t [] ;; (* なかったら Not_found を投げる (実際は through) *) let give_one_char ch (T(_, _, lst)) = List.find (fun (T(ch', _, _)) -> ch = ch') lst ;; let make_trie_from_plain_text filename = let keyword_list = [","; "."; "("; ")"; "{"; "}"; "["; "]"; ";"; "\""; "'"; "`"; " "] in let lexer = Genlex.make_lexer keyword_list (Stream.of_channel (open_in filename)) in let rec iter result = try match Stream.next lexer with Genlex.String s -> iter (s::result) | _ -> iter result with Stream.Failure -> result in List.fold_left (fun t x -> add_to_tree_part t x 0) empty (iter []) ;; let rec read_loop top = let rec sub_loop (T(_, _, l) as t) str = let buf = String.create 1 in let num = Unix.read Unix.stdin buf 0 1 in if buf = "\n" || buf = "\r" then List.iter (fun x -> print_endline ("\t++ "^x)) (List.fold_right (fun t rl -> List.append (find_all_words t) rl) l []) else let ch = buf.[0] in try let t' = give_one_char ch t in let current_prefix = str ^ (String.make 1 ch) in print_endline current_prefix; List.iter (fun x -> print_endline ("\t"^x)) (find_all_words t'); flush stdout; sub_loop t' current_prefix; with Not_found -> () in sub_loop top ""; read_loop top ;; let main () = (* 一文字ずつ取得で *) let configure_term () = let terminal_io = Unix.tcgetattr Unix.stdin in terminal_io.Unix.c_echo <- false; terminal_io.Unix.c_icanon <- false; Unix.tcsetattr Unix.stdin Unix.TCSANOW terminal_io in let print_usage () = Printf.printf "usage: %s [text_file]\n" Sys.argv.(0) in if Array.length Sys.argv != 2 then begin print_usage (); exit 1 end else let filename = Sys.argv.(1) in let t = make_trie_from_plain_text filename in configure_term (); read_loop t ;; let _ = main ()