diff --git a/nemo_text_processing/text_normalization/hi/data/date/days.tsv b/nemo_text_processing/text_normalization/hi/data/date/days.tsv index 633e2aec0..7d2dc7fbb 100644 --- a/nemo_text_processing/text_normalization/hi/data/date/days.tsv +++ b/nemo_text_processing/text_normalization/hi/data/date/days.tsv @@ -3,7 +3,7 @@ ०३ तीन ०४ चार ०५ पाँच -०६ छः +०६ छह ०७ सात ०८ आठ ०९ नौ @@ -34,7 +34,7 @@ 03 तीन 04 चार 05 पाँच -06 छः +06 छह 07 सात 08 आठ 09 नौ @@ -59,4 +59,22 @@ 28 अट्ठाईस 29 उनतीस 30 तीस -31 इकतीस \ No newline at end of file +31 इकतीस +१ एक +२ दो +३ तीन +४ चार +५ पाँच +६ छह +७ सात +८ आठ +९ नौ +1 एक +2 दो +3 तीन +4 चार +5 पाँच +6 छह +7 सात +8 आठ +9 नौ \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/date/months.tsv b/nemo_text_processing/text_normalization/hi/data/date/months.tsv index af770dafc..5eaafb648 100644 --- a/nemo_text_processing/text_normalization/hi/data/date/months.tsv +++ b/nemo_text_processing/text_normalization/hi/data/date/months.tsv @@ -21,4 +21,37 @@ 09 सितंबर 10 अक्टूबर 11 नवंबर -12 दिसंबर \ No newline at end of file +12 दिसंबर +जनवरी जनवरी +फ़रवरी फ़रवरी +फरवरी फरवरी +मार्च मार्च +अप्रैल अप्रैल +अप्रील अप्रील +मई मई +जून जून +जुलाई जुलाई +अगस्त अगस्त +सितंबर सितंबर +अक्टूबर अक्टूबर +अक्तूबर अक्तूबर +नवंबर नवंबर +दिसंबर दिसंबर +१ जनवरी +२ फ़रवरी +३ मार्च +४ अप्रैल +५ मई +६ जून +७ जुलाई +८ अगस्त +९ सितंबर +1 जनवरी +2 फ़रवरी +3 मार्च +4 अप्रैल +5 मई +6 जून +7 जुलाई +8 अगस्त +9 सितंबर \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv b/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv index d4c1ca0b1..6166ec327 100644 --- a/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv +++ b/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv @@ -1,3 +1,4 @@ -सन् -सन -साल \ No newline at end of file +सन् +सन +साल +दशक \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/date/unambiguous_days.tsv b/nemo_text_processing/text_normalization/hi/data/date/unambiguous_days.tsv new file mode 100644 index 000000000..7fb5f5380 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/date/unambiguous_days.tsv @@ -0,0 +1,38 @@ +१३ तेरह +१४ चौदह +१५ पंद्रह +१६ सोलह +१७ सत्रह +१८ अठारह +१९ उन्नीस +२० बीस +२१ इक्कीस +२२ बाईस +२३ तेईस +२४ चौबीस +२५ पच्चीस +२६ छब्बीस +२७ सत्ताईस +२८ अट्ठाईस +२९ उनतीस +३० तीस +३१ इकतीस +13 तेरह +14 चौदह +15 पंद्रह +16 सोलह +17 सत्रह +18 अठारह +19 उन्नीस +20 बीस +21 इक्कीस +22 बाईस +23 तेईस +24 चौबीस +25 पच्चीस +26 छब्बीस +27 सत्ताईस +28 अट्ठाईस +29 उनतीस +30 तीस +31 इकतीस \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index da917f3de..42d266547 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -25,6 +25,7 @@ from nemo_text_processing.text_normalization.hi.utils import get_abs_path days = pynini.string_file(get_abs_path("data/date/days.tsv")) +unambiguous_days = pynini.string_file(get_abs_path("data/date/unambiguous_days.tsv")) months = pynini.string_file(get_abs_path("data/date/months.tsv")) year_suffix = pynini.string_file(get_abs_path("data/date/year_suffix.tsv")) digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) @@ -33,23 +34,33 @@ teens_ties = pynini.union(teens_ties_hi, teens_ties_en) teens_and_ties = pynutil.add_weight(teens_ties, -0.1) -# Read suffixes from file into a list +digit_as_day = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) + with open(get_abs_path("data/date/suffixes.tsv"), "r", encoding="utf-8") as f: - suffixes_list = f.read().splitlines() + suffixes_list = [line.rstrip("\n") for line in f if line.strip()] with open(get_abs_path("data/date/prefixes.tsv"), "r", encoding="utf-8") as f: - prefixes_list = f.read().splitlines() + prefixes_list = [line.rstrip("\n") for line in f if line.strip()] -# Create union of suffixes and prefixes suffix_union = pynini.union(*suffixes_list) prefix_union = pynini.union(*prefixes_list) +verbalized_hundreds = teens_ties_hi.project("output") +verbalized_unit = pynini.union(teens_ties_hi.project("output"), digit.project("output")) + +verbalized_year_sou = ( + verbalized_hundreds + pynini.accep(" सौ") + pynini.closure(pynini.accep(" ") + verbalized_unit, 0, 1) +) + class DateFst(GraphFst): """ Finite state transducer for classifying date, e.g. - "०१-०४-२०२४" -> date { day: "एक" month: "अप्रैल" year: "दो हज़ार चौबीस" } - "०४-०१-२०२४" -> date { month: "अप्रैल" day: "एक" year: "दो हज़ार चौबीस" } - + "०१-०४-२०२४" -> date { day: "एक" month: "अप्रैल" year: "दो हज़ार चौबीस" } + "६ मार्च, २०१०" -> date { day: "छह" month: "मार्च" year: "दो हज़ार दस" } + "३१ मई, १९९० ई." -> date { day: "इकतीस" month: "मई" year: "उन्नीस सौ नब्बे" era: "ईसवी" } + "उन्नीस सौ बीस में" -> date { era: "उन्नीस सौ बीस में" } + "०३-२०१०" -> date { month: "मार्च" year: "दो हज़ार दस" } + "11-2024" -> date { month: "नवंबर" year: "दो हज़ार चौबीस" } Args: cardinal: cardinal GraphFst @@ -60,6 +71,7 @@ class DateFst(GraphFst): def __init__(self, cardinal: GraphFst): super().__init__(name="date", kind="classify") + # ── Year number graphs ──────────────────────────────────────────────── graph_year_thousands = pynini.compose( (NEMO_ALL_DIGIT + NEMO_ALL_ZERO + NEMO_ALL_DIGIT + NEMO_ALL_DIGIT), cardinal.graph_thousands ) @@ -68,52 +80,147 @@ def __init__(self, cardinal: GraphFst): ) cardinal_graph = pynini.union( - digit, teens_and_ties, cardinal.graph_hundreds, graph_year_thousands, graph_year_hundreds_as_thousands + digit, + teens_and_ties, + cardinal.graph_hundreds, + graph_year_thousands, + graph_year_hundreds_as_thousands, ) graph_year = pynini.union(graph_year_thousands, graph_year_hundreds_as_thousands) + graph_year_era = pynini.union( + graph_year_thousands, + graph_year_hundreds_as_thousands, + cardinal.graph_hundreds, + ) + + # ── Separators ─────────────────────────────────────────────────────── delete_dash = pynutil.delete("-") delete_slash = pynutil.delete("/") + delete_comma = pynutil.delete(",") + delete_space = pynutil.delete(" ") + delete_optional_space = pynini.closure(pynutil.delete(" "), 0, 1) + delete_comma_sep = delete_comma + delete_optional_space + delete_numeric_sep = pynini.union(delete_dash, delete_slash) + + # ── Day graphs ─────────────────────────────────────────────────────── + # Full day graph — all days 1-31 (used in DD-MM graphs) + day_num = pynini.union( + days, + digit_as_day, + teens_and_ties, + ) - days_graph = pynutil.insert("day: \"") + days + pynutil.insert("\"") + insert_space - - months_graph = pynutil.insert("month: \"") + months + pynutil.insert("\"") + insert_space + days_graph = pynutil.insert("day: \"") + day_num + pynutil.insert("\"") + insert_space - years_graph = pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + insert_space + # Unambiguous day graph — only days 13-31 + # Used in MM-DD graphs so they only fire when day cannot be a month number + unambiguous_day_num = pynini.union( + unambiguous_days, + ) - graph_dd_mm = days_graph + delete_dash + months_graph + unambiguous_days_graph = pynutil.insert("day: \"") + unambiguous_day_num + pynutil.insert("\"") + insert_space - graph_mm_dd = months_graph + delete_dash + days_graph + # ── Month graph ────────────────────────────────────────────────────── + months_graph = pynutil.insert("month: \"") + months + pynutil.insert("\"") + insert_space - graph_mm_dd += pynutil.insert(" preserve_order: true ") + # ── Year graph ─────────────────────────────────────────────────────── + years_graph = pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + insert_space - # Graph for era + # ── Era graph ──────────────────────────────────────────────────────── era_graph = pynutil.insert("era: \"") + year_suffix + pynutil.insert("\"") + insert_space + # ── Range graph (e.g. २९७-२७२ ई. पू.) ────────────────────────────── range_graph = pynini.cross("-", "से") - # Graph for year + # ── Century ordinal (e.g. २०वीं, १८वीं) ──────────────────────────── century_number = pynini.compose(pynini.closure(NEMO_ALL_DIGIT, 1), cardinal_graph) + pynini.accep("वीं") century_text = pynutil.insert("era: \"") + century_number + pynutil.insert("\"") + insert_space - # Updated logic to use suffix_union + # ── Year + suffix (e.g. २०२० में, १९९० का) ────────────────────────── year_number = graph_year + suffix_union year_text = pynutil.insert("era: \"") + year_number + pynutil.insert("\"") + insert_space - # Updated logic to use prefix_union - year_prefix = pynutil.insert("era: \"") + prefix_union + insert_space + graph_year + pynutil.insert("\"") + # ── Year + prefix (e.g. सन् २०२४, साल २०२०) ──────────────────────── + year_prefix = pynutil.insert("era: \"") + prefix_union + pynini.accep(" ") + graph_year + pynutil.insert("\"") + + # ── Year + prefix + suffix (e.g. सन २००८ में) ─────────────────────── + year_prefix_suffix = ( + pynutil.insert("era: \"") + + prefix_union + + pynini.accep(" ") + + graph_year + + suffix_union + + pynutil.insert("\"") + ) + + # ── Verbalized year passthrough graphs ─────────────────────────────── + graph_verbalized_year_suffix = ( + pynutil.insert("era: \"") + verbalized_year_sou + suffix_union + pynutil.insert("\"") + insert_space + ) + + graph_verbalized_year_bare = ( + pynutil.insert("era: \"") + verbalized_year_sou + pynutil.insert("\"") + insert_space + ) + + graph_verbalized_year_prefix = ( + pynutil.insert("era: \"") + prefix_union + pynini.accep(" ") + verbalized_year_sou + pynutil.insert("\"") + ) + + graph_verbalized_year_prefix_suffix = ( + pynutil.insert("era: \"") + + prefix_union + + pynini.accep(" ") + + verbalized_year_sou + + suffix_union + + pynutil.insert("\"") + ) - delete_separator = pynini.union(delete_dash, delete_slash) - graph_dd_mm_yyyy = days_graph + delete_separator + months_graph + delete_separator + years_graph + # ── Numeric separator date graphs ──────────────────────────────────── + # DD-MM: uses full day range (all 1-31) + graph_dd_mm = days_graph + delete_numeric_sep + months_graph - graph_mm_dd_yyyy = months_graph + delete_separator + days_graph + delete_separator + years_graph + # MM-DD: only fires when day is unambiguously > 12 + # This prevents 01-10 being read as MM-DD (January 10) + graph_mm_dd = months_graph + delete_numeric_sep + unambiguous_days_graph + graph_mm_dd += pynutil.insert(" preserve_order: true ") + # DD-MM-YYYY: uses full day range + graph_dd_mm_yyyy = days_graph + delete_numeric_sep + months_graph + delete_numeric_sep + years_graph + + # MM-DD-YYYY: only fires when day is unambiguously > 12 + graph_mm_dd_yyyy = ( + months_graph + delete_numeric_sep + unambiguous_days_graph + delete_numeric_sep + years_graph + ) graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ") - graph_mm_yyyy = months_graph + delete_dash + insert_space + years_graph + # ── Space-separated date graphs ────────────────────────────────────── + graph_dd_month = days_graph + delete_space + months_graph - graph_year_suffix = era_graph + graph_dd_month_comma_yyyy = days_graph + delete_space + months_graph + delete_comma_sep + years_graph + + graph_dd_month_comma_yyyy_era = ( + days_graph + delete_space + months_graph + delete_comma_sep + years_graph + era_graph + ) + + graph_month_comma_yyyy = months_graph + delete_comma_sep + years_graph + + graph_month_comma_yyyy_era = months_graph + delete_comma_sep + years_graph + era_graph + + # MM-YYYY: supports both space and dash separator + # e.g. "मार्च २००३", "०३-२०१०", "11-2024" + graph_mm_yyyy = months_graph + pynini.union(delete_space, delete_dash) + years_graph + + # ── Era-only graphs ────────────────────────────────────────────────── + graph_year_era_only = ( + pynutil.insert("era: \"") + + graph_year_era + + insert_space + + year_suffix + + pynutil.insert("\"") + + insert_space + ) graph_range = ( pynutil.insert("era: \"") @@ -126,21 +233,41 @@ def __init__(self, cardinal: GraphFst): + pynutil.insert(" preserve_order: true ") ) - # default assume dd_mm_yyyy + graph_year_suffix = era_graph + # ── Final graph ─────────────────────────────────────────────────────── final_graph = ( - pynutil.add_weight(graph_dd_mm, -0.001) - | graph_mm_dd + # Full date with era — most specific first + pynutil.add_weight(graph_dd_month_comma_yyyy_era, -0.003) + | pynutil.add_weight(graph_month_comma_yyyy_era, -0.003) + # Full numeric dates | pynutil.add_weight(graph_dd_mm_yyyy, -0.001) | graph_mm_dd_yyyy + # Full space/comma dates + | pynutil.add_weight(graph_dd_month_comma_yyyy, -0.001) + # Day + month only + | pynutil.add_weight(graph_dd_mm, -0.001) + | pynutil.add_weight(graph_dd_month, -0.001) + | graph_mm_dd + # Month + year — space or dash | pynutil.add_weight(graph_mm_yyyy, -0.2) - | pynutil.add_weight(graph_year_suffix, -0.001) + | pynutil.add_weight(graph_month_comma_yyyy, -0.2) + # Era graphs + | pynutil.add_weight(graph_year_era_only, -0.005) | pynutil.add_weight(graph_range, -0.005) + | pynutil.add_weight(graph_year_suffix, -0.001) + # Century ordinal | pynutil.add_weight(century_text, -0.001) - | pynutil.add_weight(year_text, -0.001) + # Verbalized year passthrough — more specific first + | pynutil.add_weight(graph_verbalized_year_prefix_suffix, -0.012) + | pynutil.add_weight(graph_verbalized_year_prefix, -0.011) + | pynutil.add_weight(graph_verbalized_year_suffix, -0.010) + | pynutil.add_weight(graph_verbalized_year_bare, -0.009) + # Numeric year with suffix/prefix + | pynutil.add_weight(year_prefix_suffix, -0.010) | pynutil.add_weight(year_prefix, -0.009) + | pynutil.add_weight(year_text, -0.001) ) self.final_graph = final_graph.optimize() - self.fst = self.add_tokens(self.final_graph) diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py index cb03ebce6..5c3ee661a 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py @@ -117,7 +117,7 @@ def __init__( pynutil.add_weight(whitelist_graph, 1.01) | pynutil.add_weight(cardinal_graph, 1.1) | pynutil.add_weight(decimal_graph, 1.1) - | pynutil.add_weight(fraction_graph, 1.1) + | pynutil.add_weight(fraction_graph, 1.05) | pynutil.add_weight(date_graph, 1.1) | pynutil.add_weight(time_graph, 1.1) | pynutil.add_weight(measure_graph, 1.1) diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt index 86f1f6678..2df448456 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt @@ -1,4 +1,4 @@ -06-05~छः मई +06-05~छह मई ३१-०६~इकतीस जून 02-01~दो जनवरी ०४-०१~चार जनवरी