First commit, version 0.1

3858b82b · Sherif · 09f91eb8 · 3858b82b · 3858b82b · 3858b82b
Commit 3858b82b authored Oct 11, 2016 by Sherif
Showing with 138 additions and 1 deletions
README.md
LICENSE.md → UNLICENSE.md
musta3rab.py
--- a/README.md
+++ b/README.md
-musta3rab
+## musta3rab
+---
+#### A simple-ish, easily moddable tool for romanizing Arabic text; for example, it turns "أُستاذ" into "ʼustāḏ".
+* Uses the Hans Wehr romanization system by default, but can be extended using dictionaries.
+* Licensed with [the Unlicense](http://unlicense.org/UNLICENSE), so feel free to steal it.
+### How to & Documentation
+1. Launch the script in a Python console.
+2. Call the function
+		perstr("حاجة_بالعربي")
+to return the romanization of حاجة\_بالعربي using the default (Hans Wehr) dictionary,
+or call
+	perstr("حاجة_بالعربي", dictionary)
+to return the romanization of حاجة\_بالعربي according to the system defined by _dictionary_.
+#### Issues:
+* Short vowels are romanized only where the input carries the corresponding diacritical marks; vowels that are not explicitly written are not detected.
+* See to-do list
+#### To do:
+* Some kind of GUI
+* Integrate parts of Taha Zerrouki's [Mishkal](https://github.com/linuxscout/mishkal) to fix short vowel representation to some extent, or otherwise fix it somehow
+* Make it easier to add user dictionaries
+	* Read from file?
+* Optimize romanization of ال
\ No newline at end of file
--- a/LICENSE.md
+++ b/LICENSE.md
--- a/musta3rab.py
+++ b/musta3rab.py
+#Last updated Oct 11, 2016.
+#Last lines in function editdict contain some unfinished functionality commented out.
+#~Sherif
+# Default romanization table. Each key is one character of the input text and
+# each value is the Latin text that perstr appends for it (table[ ara[i] ]).
+# The shadda has no entry here: editdict assigns it while a string is processed.
+wehr = { # Hans Wehr transliteration system: input character -> Latin output.
+# <alphabet: the basic Arabic letters
+	'ا': 'ā',
+	'ب': 'b',
+	'ت': 't',
+	'ث': 'ṯ',
+	'ج': 'j',
+	'ح': 'ḥ',
+	'خ': 'ḵ',
+	'د': 'd',
+	'ذ': 'ḏ',
+	'ر': 'r',
+	'ز': 'z',
+	'س': 's',
+	'ش': 'š',
+	'ص': 'ṣ',
+	'ض': 'ḍ',
+	'ط': 'ṭ',
+	'ظ': 'ẓ',
+	'ع': 'ʻ',
+	'غ': 'ḡ',
+	'ف': 'f',
+	'ق': 'q',
+	'ك': 'k',
+	'ل': 'l',
+	'م': 'm',
+	'ن': 'n',
+	'ه': 'h',
+	'و': 'ū', # always emitted as the long vowel
+	'ي': 'ī', # always emitted as the long vowel
+# /alphabet> <diacritics: each key is a single combining mark between the quotes
+	'َ': 'a', # fatha
+	'ً': 'an', # fathatan
+	'ُ': 'u', # damma
+	'ٌ': 'un', # dammatan
+	'ِ': 'i', # kasra
+	'ٍ': 'in', # kasratan
+	'ْ': '', # sukoon marks the absence of a vowel, so it emits nothing
+# /diacritics> <special chars: ta marbuta, hamza forms, alif maqsura
+	'ة': 'a',
+	'ء': 'ʼ',
+	'ئ': 'ʼ',
+	'ى': 'a',
+	'ؤ': 'uʼ',
+	'إ': 'ʼi',
+	'أ': 'ʼ',
+# /special chars> <non-arabic: extra letters outside the basic alphabet above
+	'پ': 'p',
+	'ڤ': 'v',
+	'گ': 'g',
+	'ڭ': 'g',
+	'چ': 'ç',
+# /non-arabic>  <non-letters: punctuation
+	'؟': '?',
+	# NOTE(review): the two parentheses are mapped to each other (swapped).
+	# Presumably meant to compensate for right-to-left mirroring; confirm
+	# against how the input text actually stores its parentheses.
+	'(': ')',
+	')': '(',
+	'ـ': '', # tatweel (elongation stroke) is dropped from the output
+	'،': ',',
+# Template for a new entry:
+#	'': '',
+	}
+skip = False  # Set to True by editdict when perstr should drop the current character.
+def editdict (c, ara, i, table):
+	"""Prepare *table* so the character at ``ara[i]`` can be looked up safely.
+
+	c     -- the character being processed; added to *table* as itself if missing.
+	ara   -- the whole string being romanized.
+	i     -- index of the current character within *ara*.
+	table -- romanization dictionary; it is modified in place.
+
+	Returns nothing. Sets the module-level ``skip`` flag to True when the
+	current character is a short-vowel mark directly followed by the long
+	vowel of the same sound. The flag is never reset here; the caller is
+	expected to clear it before each call.
+	"""
+	global skip
+	shadda = '\u0651'
+	# Short-vowel mark -> the long-vowel letter that makes it redundant.
+	long_vowel_after = {
+		'\u064e': '\u0627',  # fatha + alif
+		'\u064f': '\u0648',  # damma + waw
+		'\u0650': '\u064a',  # kasra + ya
+	}
+	cur = ara[i]
+	if c not in table:
+		# Unknown characters map to themselves so the later lookup cannot fail.
+		table[c] = c
+	if cur == shadda:
+		# A shadda doubles the character before it, so it borrows that
+		# character's value. At index 0 there is nothing before it (ara[i-1]
+		# would wrap around to the END of the string), so a leading shadda
+		# romanizes to nothing.
+		# NOTE(review): if a vowel mark sits between the consonant and the
+		# shadda, it is the vowel's value that gets repeated.
+		if i > 0:
+			prev = ara[i-1]
+			table[shadda] = table.get(prev, prev)
+		else:
+			table[shadda] = ''
+	# Look ahead only when a next character exists; the previous guard
+	# (len(ara) > i) was always true, so a string ending in a short-vowel
+	# mark raised IndexError.
+	if i + 1 < len(ara) and long_vowel_after.get(cur) == ara[i+1]:
+		# The long vowel that follows already carries the sound, so the
+		# diacritic is skipped.
+		skip = True
+	# TODO: unfinished special handling of alif followed by lam, kept as a stub:
+	#	if ( ara[i] == 'ا' and ara[i+1] == 'ل' ):
+	#		if table == wehr:
+def perstr(ara, table = wehr):
+	"""Return the romanization of the string *ara*.
+
+	ara   -- the text to romanize.
+	table -- dictionary mapping each input character to its Latin output
+	         (the Hans Wehr table by default). It is handed to editdict for
+	         every character, and editdict may add entries to it.
+	"""
+	global skip
+	pieces = []
+	for pos, char in enumerate(ara):
+		# Clear the flag first; editdict raises it when this character
+		# must be left out of the result.
+		skip = False
+		editdict(char, ara, pos, table)
+		if skip is not True:
+			pieces.append(table[char])
+	return "".join(pieces)