Back

buonaparte's audio and text links

buonaparte Wrote:I need a tool that would be able to convert Japanese texts with kanji to spaced hiragana, something similar to this http://nihongo.j-talk.com/kanji/, but able to handle large files - novels.
In case you (or somebody else) are still looking for something like that:

Code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Convert Japanese text to spaced hiragana.
#
# This tool converts Japanese text (encoded as utf-8) stored in file 'file.in'
# to spaced hiragana text stored in 'file.out' (also utf-8 encoded).
#
# For the conversion, the tool requires MeCab to be present in:
# 'C:\Program Files (x86)\MeCab\bin\mecab.exe'
#
# Usage:
# $ python convert-to-spaced-hiragana.py

import subprocess
import re
import codecs

# Scratch files in the current directory: the text handed to MeCab and the
# analysis MeCab writes back.
fn_tmp_in, fn_tmp_out = '.mc-in', '.mc-out'

# Argument vector for one MeCab run: executable, input file, then '-o'
# followed by the file the result is read back from.
command = [r'C:\Program Files (x86)\MeCab\bin\mecab.exe', fn_tmp_in]
command += ['-o', fn_tmp_out]
def mecab(line):
    """Run MeCab on *line* and return its raw output as text.

    The text is written UTF-8 encoded to the scratch file ``fn_tmp_in``,
    MeCab is started with the module-level ``command``, and the contents of
    ``fn_tmp_out`` are read back (decoded as UTF-8) and returned.

    Raises:
        OSError: if the MeCab executable cannot be started.
        subprocess.CalledProcessError: if MeCab exits with a non-zero status.
    """
    with codecs.open(fn_tmp_in, 'wb', 'utf-8') as tmp_in:
        tmp_in.write(line)

    # `command` is already an argument list, so no shell is needed (a list
    # combined with shell=True would only run its first element on POSIX).
    # check_call makes a failed run raise instead of silently falling
    # through to the output file left behind by the previous call.
    subprocess.check_call(command)

    with codecs.open(fn_tmp_out, encoding='utf-8') as tmp_out:
        return tmp_out.read()

# Parallel kana tables: the character at index i of `kata` is the katakana
# counterpart of the hiragana at index i of `hira`.
hira = (
    u'がぎぐげござじずぜぞだぢづでどばびぶべぼぱぴぷぺぽあいうえおかきくけこさしすせそたちつてと'
    u'なにぬねのはひふへほまみむめもやゆよらりるれろわをんぁぃぅぇぉゃゅょっ'
)
kata = (
    u'ガギグゲゴザジズゼゾダヂヅデドバビブベボパピプペポアイウエオカキクケコサシスセソタチツテト'
    u'ナニヌネノハヒフヘホマミムメモヤユヨラリルレロワヲンァィゥェォャュョッ'
)

# pattern_a: group 1 is everything up to and including the tab, then seven
# comma-terminated fields are skipped and group 2 captures the eighth.
pattern_a = re.compile(r'^(.*\t)' + r'.*?,' * 7 + r'(.*?),.*')
# pattern_b: group 1 is the text before the tab of any line whose remainder
# contains at least one comma.
pattern_b = re.compile(r'^(.*)\t.*?,.*')
def _kata_to_hira(reading):
    """Map each character of *reading* found in `kata` to the hiragana at the
    same index of `hira`; every other character is kept unchanged."""
    converted = []
    for ch in reading:
        idx = kata.find(ch)
        # Characters outside the table (e.g. the long-vowel mark) are passed
        # through rather than dropped, so the reading is not truncated.
        converted.append(hira[idx] if idx >= 0 else ch)
    return u''.join(converted)


def parse_line(line):
    """Convert one line of Japanese text to spaced hiragana.

    The line is analysed with ``mecab`` and its output is processed one
    token line at a time:

    * If the line matches ``pattern_a`` (surface, tab, and at least eight
      comma-terminated fields), the eighth field is taken as the reading.
      When the reading equals the surface, or is missing ('' or '*'), the
      surface is emitted unchanged. Otherwise the reading is emitted in
      hiragana; if the surface contains any character that is not in
      ``hira``, the reading is separated from its neighbours by spaces
      (no leading space at the very start of the output).
    * If the line only matches ``pattern_b`` (surface, tab, features without
      a reading field), the surface is emitted unchanged.
    * Lines matching neither pattern (no tab, e.g. a bare marker line) are
      skipped.

    Returns the converted text without a trailing newline.
    """
    pieces = []  # only non-empty strings, so `if pieces` means "output so far"
    for ml in mecab(line).split('\n'):
        match = pattern_a.match(ml)
        if match is None:
            match = pattern_b.match(ml)
            if match is not None and match.group(1):
                pieces.append(match.group(1))
            continue

        surface = match.group(1)[:-1]  # group 1 still carries the tab
        reading = match.group(2)
        if surface == reading or reading in (u'', u'*'):
            # Nothing to transliterate (or no usable reading): keep the
            # original text instead of losing the token.
            if surface:
                pieces.append(surface)
            continue

        has_kanji = any(ch not in hira for ch in surface)
        if has_kanji and pieces:
            pieces.append(u' ')
        pieces.append(_kata_to_hira(reading))
        if has_kanji:
            pieces.append(u' ')
    return u''.join(pieces)

#print u'建物のいちばん上にある一対の鳳凰の像も修理のために降ろしました。'.encode('utf-8')
#print parse_line(u'建物のいちばん上にある一対の鳳凰の像も修理のために降ろしました。').encode('utf-8')
#exit(0)

# Convert 'file.in' to 'file.out' one line at a time (both UTF-8).
# The with-statement closes both files, flushing what has been written so
# far, even when a conversion raises part-way through the input.
with codecs.open('file.in', encoding='utf-8') as file_in, \
        codecs.open('file.out', 'wb', 'utf-8') as file_out:
    for line in file_in:
        file_out.write(parse_line(line) + u'\n')
Edited: 2014-11-21, 6:10 am
Reply

Messages In This Thread