From 98d4da250b44bbdd2261d9f13625b24ec891f304 Mon Sep 17 00:00:00 2001 From: Jinwoo Bae Date: Tue, 7 Apr 2026 14:08:50 -0700 Subject: [PATCH 1/5] Add Korean TN post-processing rules for particle agreement and month handling Signed-off-by: Jinwoo Bae --- .../text_normalization/ko/taggers/date.py | 12 ++++ .../ko/verbalizers/post_processing.py | 68 +++++++++++++++++++ .../text_normalization/normalize.py | 6 +- 3 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py diff --git a/nemo_text_processing/text_normalization/ko/taggers/date.py b/nemo_text_processing/text_normalization/ko/taggers/date.py index 4f2da5702..3c89e104b 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/date.py +++ b/nemo_text_processing/text_normalization/ko/taggers/date.py @@ -249,6 +249,17 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): + pynutil.insert("\"") ) + month_josa = pynini.union("에", "은", "는", "에는").optimize() + + individual_month_component_with_josa = ( + pynutil.insert('month: "') + + month_cardinal + + pynutil.delete("월") + + pynutil.insert("월") + + pynini.closure(month_josa, 0, 1) + + pynutil.insert('"') + ).optimize() + individual_day_component = ( pynutil.insert("day: \"") + cardinal_lz @@ -272,6 +283,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): day_and_weekday_component | month_and_weekday_component | individual_year_component + | individual_month_component_with_josa | individual_month_component | individual_day_component | week_component diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py new file mode 100644 index 000000000..b363040fd --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py @@ -0,0 +1,68 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SIGMA, NEMO_SPACE, generator_main +from nemo_text_processing.utils.logging import logger + + +class PostProcessingFst: + def __init__(self, cache_dir: str = None, overwrite_cache: bool = False): + far_file = None + if cache_dir is not None and cache_dir != "None": + os.makedirs(cache_dir, exist_ok=True) + far_file = os.path.join(cache_dir, "ko_tn_post_processing.far") + + if not overwrite_cache and far_file and os.path.exists(far_file): + self.fst = pynini.Far(far_file, mode="r")["post_process_graph"] + logger.info(f"Post processing graph was restored from {far_file}.") + else: + self.fst = self.get_postprocess_graph() + if far_file: + generator_main(far_file, {"post_process_graph": self.fst}) + + def get_postprocess_graph(self): + delete_space = pynutil.delete(NEMO_SPACE) + + vowel_final = pynini.union( + "아", "야", "어", "여", "오", "요", "우", "유", "이", "애", "에", + "사", "오", "구" + ) + + rule_i_to_ga = pynini.cdrewrite( + delete_space + pynini.cross("이 ", "가 "), + vowel_final, + "", + NEMO_SIGMA, + ) + + rule_eun_to_neun = pynini.cdrewrite( + delete_space + pynini.cross("은 ", "는 "), + vowel_final, + "", + NEMO_SIGMA, + ) + + rule_eul_to_reul = pynini.cdrewrite( + delete_space + pynini.cross("을 ", "를 "), + vowel_final, + "", + NEMO_SIGMA, + ) + + graph = rule_i_to_ga @ rule_eun_to_neun @ rule_eul_to_reul + return graph.optimize() \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py index 5e2f9ebb5..5930f49bd 100644 --- a/nemo_text_processing/text_normalization/normalize.py +++ b/nemo_text_processing/text_normalization/normalize.py @@ -187,7 +187,11 @@ def __init__( self.post_processor = PostProcessingFst(cache_dir=cache_dir, overwrite_cache=overwrite_cache) elif lang == 'ko': from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst + from nemo_text_processing.text_normalization.ko.verbalizers.post_processing import PostProcessingFst from nemo_text_processing.text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst + + if post_process: + self.post_processor = PostProcessingFst(cache_dir=cache_dir, overwrite_cache=overwrite_cache) else: raise NotImplementedError(f"Language {lang} has not been supported yet.") @@ -388,7 +392,7 @@ def normalize( return text output = SPACE_DUP.sub(' ', output[1:]) - if self.lang in ["en", "hi", "vi"] and hasattr(self, 'post_processor') and self.post_processor is not None: + if self.lang in ["en", "hi", "vi", "ko"] and hasattr(self, 'post_processor') and self.post_processor is not None: output = self.post_process(output) if punct_post_process: From 19619a9b1d6906318985fcbfe0b549a5102cf85a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 Apr 2026 21:14:31 +0000 Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/ko/taggers/date.py | 2 +- .../text_normalization/ko/verbalizers/post_processing.py | 7 ++----- nemo_text_processing/text_normalization/normalize.py | 8 ++++++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/nemo_text_processing/text_normalization/ko/taggers/date.py b/nemo_text_processing/text_normalization/ko/taggers/date.py index 3c89e104b..45943e1a3 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/date.py +++ b/nemo_text_processing/text_normalization/ko/taggers/date.py @@ -259,7 +259,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): + pynini.closure(month_josa, 0, 1) + pynutil.insert('"') ).optimize() - + individual_day_component = ( pynutil.insert("day: \"") + cardinal_lz diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py index b363040fd..45fcb259f 100644 --- a/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py +++ b/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py @@ -38,10 +38,7 @@ def __init__(self, cache_dir: str = None, overwrite_cache: bool = False): def get_postprocess_graph(self): delete_space = pynutil.delete(NEMO_SPACE) - vowel_final = pynini.union( - "아", "야", "어", "여", "오", "요", "우", "유", "이", "애", "에", - "사", "오", "구" - ) + vowel_final = pynini.union("아", "야", "어", "여", "오", "요", "우", "유", "이", "애", "에", "사", "오", "구") rule_i_to_ga = pynini.cdrewrite( delete_space + pynini.cross("이 ", "가 "), @@ -65,4 +62,4 @@ def get_postprocess_graph(self): ) graph = rule_i_to_ga @ rule_eun_to_neun @ rule_eul_to_reul - return graph.optimize() \ No newline at end of file + return graph.optimize() diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py index 5930f49bd..12661bd3d 100644 --- a/nemo_text_processing/text_normalization/normalize.py +++ b/nemo_text_processing/text_normalization/normalize.py @@ -189,7 +189,7 @@ def __init__( from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.text_normalization.ko.verbalizers.post_processing import PostProcessingFst from nemo_text_processing.text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst - + if post_process: self.post_processor = PostProcessingFst(cache_dir=cache_dir, overwrite_cache=overwrite_cache) else: @@ -392,7 +392,11 @@ def normalize( return text output = SPACE_DUP.sub(' ', output[1:]) - if self.lang in ["en", "hi", "vi", "ko"] and hasattr(self, 'post_processor') and self.post_processor is not None: + if ( + self.lang in ["en", "hi", "vi", "ko"] + and hasattr(self, 'post_processor') + and self.post_processor is not None + ): output = self.post_process(output) if punct_post_process: From 420fb96f8b16de776796c965483926bc42e3881a Mon Sep 17 00:00:00 2001 From: Jinwoo Bae Date: Wed, 8 Apr 2026 00:04:09 -0700 Subject: [PATCH 3/5] Add Korean TN fraction test cases for particle agreement Signed-off-by: Jinwoo Bae --- .../test_cases_fraction.txt | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_fraction.txt index a183be59b..fc39fd495 100644 --- a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_fraction.txt +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_fraction.txt @@ -11,4 +11,18 @@ 1과1/3~일과 삼분의 일 1과√1/4~일과 사분의 루트 일 3분의1~삼분의 일 -121분의3221~백이십일분의 삼천이백이십일 \ No newline at end of file +121분의3221~백이십일분의 삼천이백이십일 +이번 경기의 3/5이 중요하다~이번 경기의 오분의 삼 이 중요하다 +전체 구역의 4/7이 통제되었다~전체 구역의 칠분의 사가 통제되었다 +설문 응답자의 9/10 이 찬성했다~설문 응답자의 십분의 구가 찬성했다 +그 중 2/3은 성공했다~그 중 삼분의 이는 성공했다 +참가자의 5/8이 탈락했다~참가자의 팔분의 오가 탈락했다 +참가자의 6/7 이 통과했다~참가자의 칠분의 육 이 통과했다 +전체의 3/4 이 감소했다~전체의 사분의 삼 이 감소했다 +응답자의 2/5이 반대했다~응답자의 오분의 이가 반대했다 +학생의 7/9 이 합격했다~학생의 구분의 칠 이 합격했다 +전체의 1/2 이 남았다~전체의 이분의 일 이 남았다 +그 중 4/5이 성공했다~그 중 오분의 사가 성공했다 +전체의 5/6이 완료되었다~전체의 육분의 오가 완료되었다 +참가자의 3/8이 탈락했다~참가자의 팔분의 삼 이 탈락했다 +응답자의 6/10 이 동의했다~응답자의 십분의 육 이 동의했다 \ No newline at end of file From ecf34ccb8657f0e91852657750fa310d29cd128d Mon Sep 17 00:00:00 2001 From: Jinwoo Bae Date: Fri, 10 Apr 2026 12:18:56 -0700 Subject: [PATCH 4/5] Fix Korean fraction verbalization with particle-aware handling and remove post_processing dependency Signed-off-by: Jinwoo Bae --- .../text_normalization/ko/taggers/fraction.py | 22 +++- .../ko/verbalizers/fraction.py | 108 ++++++++++++++++-- .../ko/verbalizers/post_processing.py | 65 ----------- .../ko/verbalizers/verbalize_final.py | 2 +- .../text_normalization/normalize.py | 6 +- .../test_cases_fraction.txt | 6 +- 6 files changed, 124 insertions(+), 85 deletions(-) delete mode 100644 nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py diff --git a/nemo_text_processing/text_normalization/ko/taggers/fraction.py b/nemo_text_processing/text_normalization/ko/taggers/fraction.py index 2163f5f7f..6181e82d1 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/ko/taggers/fraction.py @@ -81,6 +81,26 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): + numerator_component ) + # Optional particles following the fraction + particle_subject = ( + pynutil.insert('morphosyntactic_features: "분의_subject"') + + (pynutil.delete("이") | pynutil.delete("가")) + ) + particle_topic = ( + pynutil.insert('morphosyntactic_features: "분의_topic"') + + (pynutil.delete("은") | pynutil.delete("는")) + ) + particle_object = ( + pynutil.insert('morphosyntactic_features: "분의_object"') + + (pynutil.delete("을") | pynutil.delete("를")) + ) + + optional_particle = pynini.closure( + pynutil.insert(NEMO_SPACE) + (particle_subject | particle_topic | particle_object), + 0, + 1, + ) + # Optional minus sign optional_sign = ( pynutil.insert(f'negative: {DOUBLE_QUOTE}') @@ -90,7 +110,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): ) # Combine full graph - graph = pynini.closure(optional_sign, 0, 1) + (graph_fraction_slash | graph_fraction_word) + graph = pynini.closure(optional_sign, 0, 1) + (graph_fraction_slash | graph_fraction_word) + optional_particle self.graph = graph.optimize() final_graph = self.add_tokens(graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py b/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py index bafbf133d..5886d408c 100644 --- a/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py @@ -17,7 +17,6 @@ from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst, delete_space - class FractionFst(GraphFst): """ Finite state transducer for verbalizing Korean fractions, e.g. @@ -60,7 +59,46 @@ def __init__(self, deterministic: bool = True): + numerator_component ) - # Match and delete integer_part field (e.g., "2" in "2과3분의1") + # Handle subject particle feature (분의_subject) + # Insert default particle "이" (will be corrected later via rewrite rules) + subject_suffix = ( + pynutil.delete(NEMO_SPACE) + + pynutil.delete('morphosyntactic_features:') + + delete_space + + pynutil.delete('"분의_subject"') + + delete_space + + pynutil.insert("이") # 일단 기본값 + ) + + # Handle topic particle feature (분의_topic) + topic_suffix = ( + pynutil.delete(NEMO_SPACE) + + pynutil.delete('morphosyntactic_features:') + + delete_space + + pynutil.delete('"분의_topic"') + + delete_space + + pynutil.insert("은") + ) + + # Handle object particle feature (분의_object) + object_suffix = ( + pynutil.delete(NEMO_SPACE) + + pynutil.delete('morphosyntactic_features:') + + delete_space + + pynutil.delete('"분의_object"') + + delete_space + + pynutil.insert("을") + ) + + # Combine fraction + optional particle suffix + # Particle is always inserted first in default form and later corrected + graph_fraction_all = ( + graph_fraction + + pynini.closure(subject_suffix | topic_suffix | object_suffix, 0, 1) + ) + + # Handle integer + fraction (e.g., "2과 3/4") + # integer_part is removed and replaced with proper spacing graph_integer = ( pynutil.delete('integer_part:') + delete_space @@ -69,9 +107,10 @@ def __init__(self, deterministic: bool = True): + pynutil.delete('"') + pynutil.insert(NEMO_SPACE) ) - graph_integer_fraction = graph_integer + delete_space + graph_fraction - - # Match and delete optional negative field (e.g., "마이너스") + # Combine integer part with fraction + graph_integer_fraction = graph_integer + delete_space + graph_fraction_all + + # Handle optional negative prefix (e.g., "마이너스") optional_sign = ( pynutil.delete('negative:') + delete_space @@ -82,9 +121,58 @@ def __init__(self, deterministic: bool = True): + pynutil.insert(NEMO_SPACE) ) - # Final graph handles optional negative + (integer + fraction | fraction only) - graph = pynini.closure(optional_sign, 0, 1) + (graph_integer_fraction | graph_fraction) - - # Final optimized verbalizer FST + # Final structure: + # [optional negative] + (integer + fraction OR fraction only) + graph = pynini.closure(optional_sign, 0, 1) + (graph_integer_fraction | graph_fraction_all) + + # Remove token wrappers final_graph = self.delete_tokens(graph) - self.fst = final_graph.optimize() + + # Sigma for rewrite context (entire string) + sigma = pynini.closure(NEMO_NOT_QUOTE | NEMO_SPACE) + + # Fix subject particle agreement (이 → 가 for vowel-ending numerals) + # e.g., 사이 → 사가, 구이 → 구가 + subject_rewrite = pynini.cdrewrite( + pynini.union( + pynini.cross("이이", "이가"), + pynini.cross("사이", "사가"), + pynini.cross("오이", "오가"), + pynini.cross("구이", "구가"), + ), + "", + "", + sigma, + ) + + # Fix topic particle agreement (은 → 는) + # e.g., 이은 → 이는, 사은 → 사는 + topic_rewrite = pynini.cdrewrite( + pynini.union( + pynini.cross("이은", "이는"), + pynini.cross("사은", "사는"), + pynini.cross("오은", "오는"), + pynini.cross("구은", "구는"), + ), + "", + "", + sigma, + ) + + # Fix object particle agreement (을 → 를) + # e.g., 오을 → 오를, 이을 → 이를 + object_rewrite = pynini.cdrewrite( + pynini.union( + pynini.cross("이을", "이를"), + pynini.cross("사을", "사를"), + pynini.cross("오을", "오를"), + pynini.cross("구을", "구를"), + ), + "", + "", + sigma, + ) + + # Apply all rewrite rules sequentially and final optimized FST + final_graph = final_graph @ subject_rewrite @ topic_rewrite @ object_rewrite + self.fst = final_graph.optimize() \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py b/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py deleted file mode 100644 index 45fcb259f..000000000 --- a/nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import pynini -from pynini.lib import pynutil - -from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SIGMA, NEMO_SPACE, generator_main -from nemo_text_processing.utils.logging import logger - - -class PostProcessingFst: - def __init__(self, cache_dir: str = None, overwrite_cache: bool = False): - far_file = None - if cache_dir is not None and cache_dir != "None": - os.makedirs(cache_dir, exist_ok=True) - far_file = os.path.join(cache_dir, "ko_tn_post_processing.far") - - if not overwrite_cache and far_file and os.path.exists(far_file): - self.fst = pynini.Far(far_file, mode="r")["post_process_graph"] - logger.info(f"Post processing graph was restored from {far_file}.") - else: - self.fst = self.get_postprocess_graph() - if far_file: - generator_main(far_file, {"post_process_graph": self.fst}) - - def get_postprocess_graph(self): - delete_space = pynutil.delete(NEMO_SPACE) - - vowel_final = pynini.union("아", "야", "어", "여", "오", "요", "우", "유", "이", "애", "에", "사", "오", "구") - - rule_i_to_ga = pynini.cdrewrite( - delete_space + pynini.cross("이 ", "가 "), - vowel_final, - "", - NEMO_SIGMA, - ) - - rule_eun_to_neun = pynini.cdrewrite( - delete_space + pynini.cross("은 ", "는 "), - vowel_final, - "", - NEMO_SIGMA, - ) - - rule_eul_to_reul = pynini.cdrewrite( - delete_space + pynini.cross("을 ", "를 "), - vowel_final, - "", - NEMO_SIGMA, - ) - - graph = rule_i_to_ga @ rule_eun_to_neun @ rule_eul_to_reul - return graph.optimize() diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py index 0271a4b7b..1c0fabd1b 100644 --- a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py @@ -69,4 +69,4 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_ self.fst = verbalizer.optimize() if far_file: - generator_main(far_file, {"verbalize": self.fst}) + generator_main(far_file, {"verbalize": self.fst}) \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py index 12661bd3d..7682ef047 100644 --- a/nemo_text_processing/text_normalization/normalize.py +++ b/nemo_text_processing/text_normalization/normalize.py @@ -187,11 +187,7 @@ def __init__( self.post_processor = PostProcessingFst(cache_dir=cache_dir, overwrite_cache=overwrite_cache) elif lang == 'ko': from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst - from nemo_text_processing.text_normalization.ko.verbalizers.post_processing import PostProcessingFst from nemo_text_processing.text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst - - if post_process: - self.post_processor = PostProcessingFst(cache_dir=cache_dir, overwrite_cache=overwrite_cache) else: raise NotImplementedError(f"Language {lang} has not been supported yet.") @@ -393,7 +389,7 @@ def normalize( output = SPACE_DUP.sub(' ', output[1:]) if ( - self.lang in ["en", "hi", "vi", "ko"] + self.lang in ["en", "hi", "vi"] and hasattr(self, 'post_processor') and self.post_processor is not None ): diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_fraction.txt index fc39fd495..65e5049b8 100644 --- a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_fraction.txt +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_fraction.txt @@ -12,9 +12,9 @@ 1과√1/4~일과 사분의 루트 일 3분의1~삼분의 일 121분의3221~백이십일분의 삼천이백이십일 -이번 경기의 3/5이 중요하다~이번 경기의 오분의 삼 이 중요하다 +이번 경기의 3/5이 중요하다~이번 경기의 오분의 삼이 중요하다 전체 구역의 4/7이 통제되었다~전체 구역의 칠분의 사가 통제되었다 -설문 응답자의 9/10 이 찬성했다~설문 응답자의 십분의 구가 찬성했다 +설문 응답자의 9/10이 찬성했다~설문 응답자의 십분의 구가 찬성했다 그 중 2/3은 성공했다~그 중 삼분의 이는 성공했다 참가자의 5/8이 탈락했다~참가자의 팔분의 오가 탈락했다 참가자의 6/7 이 통과했다~참가자의 칠분의 육 이 통과했다 @@ -24,5 +24,5 @@ 전체의 1/2 이 남았다~전체의 이분의 일 이 남았다 그 중 4/5이 성공했다~그 중 오분의 사가 성공했다 전체의 5/6이 완료되었다~전체의 육분의 오가 완료되었다 -참가자의 3/8이 탈락했다~참가자의 팔분의 삼 이 탈락했다 +참가자의 3/8이 탈락했다~참가자의 팔분의 삼이 탈락했다 응답자의 6/10 이 동의했다~응답자의 십분의 육 이 동의했다 \ No newline at end of file From 7acdb886c04c102814c4a6b7645a4627f0f89fbb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 10 Apr 2026 20:02:04 +0000 Subject: [PATCH 5/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/ko/taggers/fraction.py | 17 +++++------- .../ko/verbalizers/fraction.py | 26 +++++++++---------- .../ko/verbalizers/verbalize_final.py | 2 +- .../text_normalization/normalize.py | 6 +---- 4 files changed, 21 insertions(+), 30 deletions(-) diff --git a/nemo_text_processing/text_normalization/ko/taggers/fraction.py b/nemo_text_processing/text_normalization/ko/taggers/fraction.py index 6181e82d1..64ea0c56e 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/ko/taggers/fraction.py @@ -82,17 +82,14 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): ) # Optional particles following the fraction - particle_subject = ( - pynutil.insert('morphosyntactic_features: "분의_subject"') - + (pynutil.delete("이") | pynutil.delete("가")) + particle_subject = pynutil.insert('morphosyntactic_features: "분의_subject"') + ( + pynutil.delete("이") | pynutil.delete("가") ) - particle_topic = ( - pynutil.insert('morphosyntactic_features: "분의_topic"') - + (pynutil.delete("은") | pynutil.delete("는")) + particle_topic = pynutil.insert('morphosyntactic_features: "분의_topic"') + ( + pynutil.delete("은") | pynutil.delete("는") ) - particle_object = ( - pynutil.insert('morphosyntactic_features: "분의_object"') - + (pynutil.delete("을") | pynutil.delete("를")) + particle_object = pynutil.insert('morphosyntactic_features: "분의_object"') + ( + pynutil.delete("을") | pynutil.delete("를") ) optional_particle = pynini.closure( @@ -100,7 +97,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): 0, 1, ) - + # Optional minus sign optional_sign = ( pynutil.insert(f'negative: {DOUBLE_QUOTE}') diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py b/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py index 5886d408c..84e9db160 100644 --- a/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py @@ -17,6 +17,7 @@ from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst, delete_space + class FractionFst(GraphFst): """ Finite state transducer for verbalizing Korean fractions, e.g. @@ -67,7 +68,7 @@ def __init__(self, deterministic: bool = True): + delete_space + pynutil.delete('"분의_subject"') + delete_space - + pynutil.insert("이") # 일단 기본값 + + pynutil.insert("이") # 일단 기본값 ) # Handle topic particle feature (분의_topic) @@ -89,14 +90,11 @@ def __init__(self, deterministic: bool = True): + delete_space + pynutil.insert("을") ) - + # Combine fraction + optional particle suffix # Particle is always inserted first in default form and later corrected - graph_fraction_all = ( - graph_fraction - + pynini.closure(subject_suffix | topic_suffix | object_suffix, 0, 1) - ) - + graph_fraction_all = graph_fraction + pynini.closure(subject_suffix | topic_suffix | object_suffix, 0, 1) + # Handle integer + fraction (e.g., "2과 3/4") # integer_part is removed and replaced with proper spacing graph_integer = ( @@ -109,7 +107,7 @@ def __init__(self, deterministic: bool = True): ) # Combine integer part with fraction graph_integer_fraction = graph_integer + delete_space + graph_fraction_all - + # Handle optional negative prefix (e.g., "마이너스") optional_sign = ( pynutil.delete('negative:') @@ -124,13 +122,13 @@ def __init__(self, deterministic: bool = True): # Final structure: # [optional negative] + (integer + fraction OR fraction only) graph = pynini.closure(optional_sign, 0, 1) + (graph_integer_fraction | graph_fraction_all) - + # Remove token wrappers final_graph = self.delete_tokens(graph) - + # Sigma for rewrite context (entire string) sigma = pynini.closure(NEMO_NOT_QUOTE | NEMO_SPACE) - + # Fix subject particle agreement (이 → 가 for vowel-ending numerals) # e.g., 사이 → 사가, 구이 → 구가 subject_rewrite = pynini.cdrewrite( @@ -144,7 +142,7 @@ def __init__(self, deterministic: bool = True): "", sigma, ) - + # Fix topic particle agreement (은 → 는) # e.g., 이은 → 이는, 사은 → 사는 topic_rewrite = pynini.cdrewrite( @@ -172,7 +170,7 @@ def __init__(self, deterministic: bool = True): "", sigma, ) - + # Apply all rewrite rules sequentially and final optimized FST final_graph = final_graph @ subject_rewrite @ topic_rewrite @ object_rewrite - self.fst = final_graph.optimize() \ No newline at end of file + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py index 1c0fabd1b..0271a4b7b 100644 --- a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py @@ -69,4 +69,4 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_ self.fst = verbalizer.optimize() if far_file: - generator_main(far_file, {"verbalize": self.fst}) \ No newline at end of file + generator_main(far_file, {"verbalize": self.fst}) diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py index 7682ef047..5e2f9ebb5 100644 --- a/nemo_text_processing/text_normalization/normalize.py +++ b/nemo_text_processing/text_normalization/normalize.py @@ -388,11 +388,7 @@ def normalize( return text output = SPACE_DUP.sub(' ', output[1:]) - if ( - self.lang in ["en", "hi", "vi"] - and hasattr(self, 'post_processor') - and self.post_processor is not None - ): + if self.lang in ["en", "hi", "vi"] and hasattr(self, 'post_processor') and self.post_processor is not None: output = self.post_process(output) if punct_post_process: