(************************************************************
   utf.ml
   Created      : Fri Mar 14 23:15:14 2003
   Last modified: Sat Mar 15 03:47:11 2003
   Compile: ocamlc -g utf.ml -o utf #
   FTP Directory: sources/ocaml #
 ************************************************************)
(** @author Takashi Masuyama *)

(** Name of the mapping table file, opened relative to the current
    directory the first time a two-byte character has to be converted. *)
let table_file_name = "JIS0208.TXT"

(*let make_table *)

(** Bounds of the Shift-JIS code range (not used by the conversion below). *)
let sjis_code_start = 0x8140
let sjis_code_end = 0xEAA4

(** Offset added to the second-column code of the table file to obtain the
    two-byte code used for lookups. *)
let additional = 0x8080

(** First and last two-byte codes covered by the lookup table. *)
let euc_code_start = 0x2121 + additional
let euc_code_end = 0x7426 + additional

(** [read_table_file s e input] reads the mapping table from [input] and
    returns an array of [e - s + 1] entries, indexed by [code - s], holding
    the third-column value of each table line. Entries that never appear in
    the file stay at [0].

    Empty lines and lines starting with ['#'] are skipped. Reading stops at
    the first [End_of_file], whether it comes from the channel or from
    parsing a line, which matches the previous behaviour. The channel is not
    closed here; that is left to the caller.

    @raise Invalid_argument if a line maps a code outside [s .. e].
    @raise Scanf.Scan_failure if a line does not match the expected format. *)
let read_table_file s e input =
  let table = Array.make (e - s + 1) 0 in
  (* The first column and the trailing text are parsed but not needed. *)
  let store _sjis code unicode _rest =
    table.(code + additional - s) <- unicode
  in
  (try
     while true do
       let line = input_line input in
       if String.length line > 0 && line.[0] <> '#' then
         Scanf.sscanf line "%i\t%i\t%i%s" store
     done
   with End_of_file -> ());
  table

(*let read_euc_char input =*)
(*let data =*)
(*  let tmp = "Αύ" in*)
(*  ((Char.code tmp.[0]) lsl 8) lor (Char.code tmp.[1])*)

(** Lookup table, loaded once on first use. The table file is closed as
    soon as it has been read, including when reading fails. *)
let euc_table =
  lazy
    (let input = open_in table_file_name in
     try
       let table = read_table_file euc_code_start euc_code_end input in
       close_in input;
       table
     with e ->
       close_in_noerr input;
       raise e)

(** [add_utf8 buf code] appends the UTF-8 encoding of [code] to [buf],
    using one, two or three bytes.
    @raise Not_found if [code] is negative or above [0xFFFF]. *)
let add_utf8 buf code =
  if code < 0 || code > 0xFFFF then raise Not_found
  else if code <= 0x7F then
    (* One-byte form; a two-byte form here would be an overlong encoding. *)
    Buffer.add_char buf (Char.chr code)
  else if code <= 0x7FF then begin
    Buffer.add_char buf (Char.chr (((code lsr 6) land 0x1F) lor 0xC0));
    Buffer.add_char buf (Char.chr ((code land 0x3F) lor 0x80)) (* 6 bit *)
  end
  else begin
    Buffer.add_char buf (Char.chr (((code lsr 12) land 0xF) lor 0xE0));
    Buffer.add_char buf (Char.chr (((code lsr 6) land 0x3F) lor 0x80));
    Buffer.add_char buf (Char.chr ((code land 0x3F) lor 0x80)) (* 6 bit *)
  end

(** [pure_euc_to_utf8 code] converts the byte string [code] to UTF-8.
    Bytes up to [0x7F] are copied unchanged; any other byte is combined
    with the following byte into a two-byte code that is looked up in the
    table read from [table_file_name].

    The table is only loaded when the first two-byte code is met, so
    input made of bytes up to [0x7F] does not need the table file.

    @raise Invalid_argument if the string ends in the middle of a two-byte
      code, or if a two-byte code lies outside the table range.
    @raise Not_found if the table has no entry for a two-byte code, or the
      entry cannot be encoded in at most three UTF-8 bytes.
    @raise Sys_error if the table file cannot be opened. *)
let pure_euc_to_utf8 code =
  let len = String.length code in
  let buf = Buffer.create (len + 1) in
  let rec iter pos =
    if pos >= len then Buffer.contents buf
    else
      let c1 = Char.code code.[pos] in
      if c1 <= 0x7F then begin
        Buffer.add_char buf code.[pos];
        iter (pos + 1)
      end
      else begin
        if pos + 1 >= len then
          invalid_arg "pure_euc_to_utf8: truncated two-byte sequence";
        let c2 = Char.code code.[pos + 1] in
        let index = ((c1 lsl 8) lor c2) - euc_code_start in
        let table = Lazy.force euc_table in
        if index < 0 || index >= Array.length table then
          invalid_arg "pure_euc_to_utf8: two-byte code outside the table";
        let unicode = table.(index) in
        (* Zero is the initial value of every entry, i.e. no mapping. *)
        if unicode = 0 then raise Not_found;
        add_utf8 buf unicode;
        iter (pos + 2)
      end
  in
  iter 0

(** [encode_from_input input] reads [input] line by line until end of file
    and prints the UTF-8 conversion of each line on standard output.
    Only the end of [input] stops the loop; conversion errors propagate. *)
let rec encode_from_input input =
  match (try Some (input_line input) with End_of_file -> None) with
  | None -> ()
  | Some line ->
    print_endline (pure_euc_to_utf8 line);
    encode_from_input input

(* Program entry point: convert the fixed input file to standard output,
   closing the channel even when the conversion raises. *)
let () =
  let input = open_in "../aki/aki.txt" in
  (try encode_from_input input
   with e ->
     close_in_noerr input;
     raise e);
  close_in input