HusseinBashir committed on
Commit
44c93e2
·
verified ·
1 Parent(s): a47d925

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -11
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import os
2
  os.environ["HF_HOME"] = "/tmp"
3
  os.environ["TRANSFORMERS_CACHE"] = "/tmp"
@@ -35,6 +36,52 @@ number_words = {
35
  100: "boqol", 1000: "kun"
36
  }
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  def number_to_words(number):
39
  number = int(number)
40
  if number < 20:
@@ -50,21 +97,13 @@ def number_to_words(number):
50
  return part
51
  elif number < 1000000:
52
  thousands, remainder = divmod(number, 1000)
53
- words = []
54
- if thousands == 1:
55
- words.append("kun")
56
- else:
57
- words.append(number_to_words(thousands) + " kun")
58
  if remainder:
59
  words.append("iyo " + number_to_words(remainder))
60
  return " ".join(words)
61
  elif number < 1000000000:
62
  millions, remainder = divmod(number, 1000000)
63
- words = []
64
- if millions == 1:
65
- words.append("milyan")
66
- else:
67
- words.append(number_to_words(millions) + " milyan")
68
  if remainder:
69
  words.append(number_to_words(remainder))
70
  return " ".join(words)
@@ -72,11 +111,27 @@ def number_to_words(number):
72
  return str(number)
73
 
74
  def normalize_text(text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  text = re.sub(r'(\d{1,3})(,\d{3})+', lambda m: m.group(0).replace(",", ""), text)
76
  text = re.sub(r'\.\d+', '', text)
 
77
  def replace_num(match):
78
  return number_to_words(match.group())
79
  text = re.sub(r'\d+', replace_num, text)
 
80
  symbol_map = {
81
  '$': 'doolar',
82
  '=': 'egwal',
@@ -85,9 +140,13 @@ def normalize_text(text):
85
  }
86
  for sym, word in symbol_map.items():
87
  text = text.replace(sym, ' ' + word + ' ')
 
88
  text = text.replace("KH", "qa").replace("Z", "S")
89
  text = text.replace("SH", "SHa'a").replace("DH", "Dha'a")
90
- text = text.replace("ZamZam", "SamSam")
 
 
 
91
  return text
92
 
93
  def waveform_to_wav_bytes(waveform: torch.Tensor, sample_rate: int = 22050) -> bytes:
 
1
+ # Environment settings
2
  import os
3
  os.environ["HF_HOME"] = "/tmp"
4
  os.environ["TRANSFORMERS_CACHE"] = "/tmp"
 
36
  100: "boqol", 1000: "kun"
37
  }
38
 
39
+ shortcut_map = {
40
+ "asc": "asalaamu caleykum",
41
+ "wcs": "wacaleykum salaam",
42
+ "fcn": "fiican",
43
+ "xld": "xaaladda ka waran",
44
+ "kwrn": "kawaran",
45
+ "scw": "salalaahu caleyhi wa salam",
46
+ "alx": "alxamdu lilaahi",
47
+ "m.a": "maasha allah",
48
+ "sthy": "side tahey",
49
+ "sxp": "saaxiib"
50
+ }
51
+
52
+ country_map = {
53
+ "somalia": "Soomaaliya",
54
+ "ethiopia": "Itoobiya",
55
+ "kenya": "Kenya",
56
+ "djibouti": "Jabuuti",
57
+ "sudan": "Suudaan",
58
+ "Yeman": "yemaan",
59
+ "uganda": "Ugaandha",
60
+ "tanzania": "Tansaaniya",
61
+ "egypt": "Masar",
62
+ "libya": "Liibiya",
63
+ "algeria": "Aljeeriya",
64
+ "morocco": "Morooko",
65
+ "tunisia": "Tuniisiya",
66
+ "eritrea": "Eriteriya",
67
+ "malawi": "Malaawi",
68
+ "English": "ingiriis",
69
+ "Spain": "isbeen",
70
+ "Brazil": "baraasiil",
71
+ "niger": "Niyjer",
72
+ "Italy": "itaaliya",
73
+ "united states": "Maraykanka",
74
+ "china": "Shiinaha",
75
+ "india": "Hindiya",
76
+ "russia": "Ruushka",
77
+ "Saudi Arabia": "Sucuudi Carabiya",
78
+ "germany": "Jarmalka",
79
+ "france": "Faransiiska",
80
+ "japan": "Jabaan",
81
+ "canada": "Kanada",
82
+ "australia": "Australia"
83
+ }
84
+
85
  def number_to_words(number):
86
  number = int(number)
87
  if number < 20:
 
97
  return part
98
  elif number < 1000000:
99
  thousands, remainder = divmod(number, 1000)
100
+ words = [number_to_words(thousands) + " kun" if thousands > 1 else "kun"]
 
 
 
 
101
  if remainder:
102
  words.append("iyo " + number_to_words(remainder))
103
  return " ".join(words)
104
  elif number < 1000000000:
105
  millions, remainder = divmod(number, 1000000)
106
+ words = [number_to_words(millions) + " milyan" if millions > 1 else "milyan"]
 
 
 
 
107
  if remainder:
108
  words.append(number_to_words(remainder))
109
  return " ".join(words)
 
111
  return str(number)
112
 
113
  def normalize_text(text):
114
+ text = re.sub(r'(?i)(?<!\w)zamzam(?!\w)', 'samsam', text)
115
+
116
+ def replace_shortcuts(match):
117
+ word = match.group(0).lower()
118
+ return shortcut_map.get(word, word)
119
+ pattern = re.compile(r'\b(' + '|'.join(re.escape(k) for k in shortcut_map.keys()) + r')\b', re.IGNORECASE)
120
+ text = pattern.sub(replace_shortcuts, text)
121
+
122
+ def replace_countries(match):
123
+ word = match.group(0).lower()
124
+ return country_map.get(word, word)
125
+ country_pattern = re.compile(r'\b(' + '|'.join(re.escape(k) for k in country_map.keys()) + r')\b', re.IGNORECASE)
126
+ text = country_pattern.sub(replace_countries, text)
127
+
128
  text = re.sub(r'(\d{1,3})(,\d{3})+', lambda m: m.group(0).replace(",", ""), text)
129
  text = re.sub(r'\.\d+', '', text)
130
+
131
  def replace_num(match):
132
  return number_to_words(match.group())
133
  text = re.sub(r'\d+', replace_num, text)
134
+
135
  symbol_map = {
136
  '$': 'doolar',
137
  '=': 'egwal',
 
140
  }
141
  for sym, word in symbol_map.items():
142
  text = text.replace(sym, ' ' + word + ' ')
143
+
144
  text = text.replace("KH", "qa").replace("Z", "S")
145
  text = text.replace("SH", "SHa'a").replace("DH", "Dha'a")
146
+
147
+ if re.search(r'(?i)(zamzam|samsam)[\s\.,!?]*$', text.strip()):
148
+ text += " m"
149
+
150
  return text
151
 
152
  def waveform_to_wav_bytes(waveform: torch.Tensor, sample_rate: int = 22050) -> bytes: