Skip to content

API Reference

This reference is automatically generated from the source code docstrings.

Arudh Processor

pyarud.processor.ArudhProcessor

The main engine for Arabic prosody analysis.

This class handles: (1) converting Arabic text to Arudi writing (phonetic representation); (2) converting Arudi text to binary patterns (1s and 0s); (3) detecting the poetic meter (Bahr) from a list of meters; (4) performing granular, foot-by-foot analysis to identify defects (Zihaf/Ellah).

Source code in pyarud/processor.py
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
class ArudhProcessor:
    """
    The main engine for Arabic prosody analysis.

    This class handles:
    1. Converting Arabic text to Arudi writing (phonetic representation).
    2. Converting Arudi text to binary patterns (1s and 0s).
    3. Detecting the poetic meter (Bahr) from a list of meters.
    4. Performing granular, foot-by-foot analysis to identify defects (Zihaf/Ellah).
    """
    def __init__(self):
        # Converter turning raw Arabic text into (arudi_text, binary_pattern) tuples.
        self.converter = ArudiConverter()
        # Registry of meter name -> Bahr subclass, supplied by the project.
        self.meter_classes = get_all_meters()
        # Cache: meter name -> {'sadr': [...], 'ajuz': [...], 'pairs': set()}.
        self.precomputed_patterns = {}
        self._precompute_patterns()

    def _precompute_patterns(self):
        """
        Generates structured valid patterns for each meter using the detailed_patterns engine.
        """
        for name, bahr_cls in self.meter_classes.items():
            bahr_instance = bahr_cls()
            # detailed_patterns returns {'sadr': [...], 'ajuz': [...], 'pairs': set()}
            self.precomputed_patterns[name] = bahr_instance.detailed_patterns

    def _get_similarity(self, a, b):
        """Return a similarity score in [0.0, 1.0] between two pattern strings."""
        # Raise the SequenceMatcher ratio to the 6th power to penalize small
        # mismatches heavily: a 0.95 raw ratio becomes ~0.73, increasing the
        # separation between near-misses and exact matches significantly.
        return math.pow(SequenceMatcher(None, a, b).ratio(), 6)

    def process_poem(self, verses, meter_name=None):
        """
        Analyzes a list of verses to detect the meter and evaluate prosodic correctness.

        Args:
            verses (list[tuple[str, str]]): A list of tuples, where each tuple contains
                the Sadr (first hemistich) and Ajuz (second hemistich) of a verse.
            meter_name (str, optional): The name of a specific meter to force the analysis against.
                If provided, auto-detection is skipped. Defaults to None.

        Returns:
            dict: A dictionary containing:
                - `meter` (str): The name of the detected or forced meter.
                - `verses` (list[dict]): A list of analysis results for each verse, including:
                    - `score` (float): Compatibility score (0.0 - 1.0).
                    - `sadr_analysis` (list[dict]): Detailed foot-by-foot analysis of the Sadr.
                    - `ajuz_analysis` (list[dict]): Detailed foot-by-foot analysis of the Ajuz.
        """
        detected_counts = Counter()
        temp_results = []

        # 1. Detect Meter for each verse (if not forced)
        for i, (sadr, ajuz) in enumerate(verses):
            # Convert text to pattern
            # Generate candidates for Sadr: Saturated (Standard) and Unsaturated (Mudawwar/Fragment)
            sadr_res_sat = self.converter.prepare_text(sadr, saturate=True)
            sadr_res_unsat = self.converter.prepare_text(sadr, saturate=False)

            # Generate candidates for Ajuz: Saturated (Mutlaq) and Unsaturated (Muqayyad)
            ajuz_res_sat = self.converter.prepare_text(ajuz, saturate=True)
            ajuz_res_unsat = self.converter.prepare_text(ajuz, saturate=False, muqayyad=True)

            # Handle single shatr input if needed (future proofing)
            if not ajuz:
                ajuz_res_sat = ("", "")
                ajuz_res_unsat = ("", "")

            match_info = None

            # Collect candidates: [(arudi_text, pattern), ...]
            # Keep the unsaturated variant only when it yields a distinct pattern.
            sadr_candidates = [sadr_res_sat]
            if sadr_res_unsat[1] != sadr_res_sat[1]:
                sadr_candidates.append(sadr_res_unsat)

            ajuz_candidates = [ajuz_res_sat]
            if ajuz_res_unsat[1] != ajuz_res_sat[1]:
                ajuz_candidates.append(ajuz_res_unsat)

            # Find best meter (or best fit for forced meter)
            candidates = self._find_best_meter(sadr_candidates, ajuz_candidates, target_meter=meter_name)
            if candidates:
                best_match = candidates[0]
                detected_counts[best_match["meter"]] += 1
                match_info = best_match

            # Determine which candidates won
            chosen_sadr = sadr_candidates[0] # Default
            chosen_ajuz = ajuz_candidates[0] # Default

            if match_info:
                if "sadr_input_pattern" in match_info:
                     for cand in sadr_candidates:
                         if cand[1] == match_info["sadr_input_pattern"]:
                             chosen_sadr = cand
                             break
                if "ajuz_input_pattern" in match_info:
                     for cand in ajuz_candidates:
                         if cand[1] == match_info["ajuz_input_pattern"]:
                             chosen_ajuz = cand
                             break

            temp_results.append(
                {
                    "index": i,
                    "sadr": {"text": sadr, "pattern": chosen_sadr[1], "arudi": chosen_sadr[0]},
                    "ajuz": {"text": ajuz, "pattern": chosen_ajuz[1], "arudi": chosen_ajuz[0]},
                    "match": match_info,
                }
            )

        # A forced meter always wins; otherwise take the majority vote across verses.
        if meter_name:
            global_meter = meter_name
        elif detected_counts:
            global_meter = detected_counts.most_common(1)[0][0]
        else:
            return {"error": "Could not detect any valid meter."}

        # 2. Analyze against Global Meter
        final_analysis = []
        for res in temp_results:
            analysis = self._analyze_verse(res, global_meter)
            final_analysis.append(analysis)

        return {"meter": global_meter, "verses": final_analysis}

    def _find_best_meter(self, sadr_candidates, ajuz_candidates, target_meter=None):
        """
        Score the candidate patterns against every meter (or one forced meter).

        Args:
            sadr_candidates (list[tuple[str, str]]): (arudi_text, pattern) options for the Sadr.
            ajuz_candidates (list[tuple[str, str]]): (arudi_text, pattern) options for the Ajuz.
            target_meter (str, optional): If given, only this meter is scored;
                an unknown name yields an empty list.

        Returns:
            list[dict]: One entry per scored meter, sorted best-first by
                (rounded score, valid Arudh/Dharb pairing, static priority).
        """
        # Static tie-breakers applied when rounded scores are equal.
        METER_PRIORITY = {
            "rajaz": 20,
            "kamel": 10,
            "hazaj": 20,
            "wafer": 10,
            "saree": 20,
            "munsareh": 10,
            "baseet": 10,
            "ramal": 15,
            "mutadarak": 15,
            "mutakareb": 15,
        }

        candidates = []

        meters_to_check = self.precomputed_patterns.items()
        if target_meter:
            if target_meter in self.precomputed_patterns:
                meters_to_check = [(target_meter, self.precomputed_patterns[target_meter])]
            else:
                return []

        for name, patterns in meters_to_check:
            # 1. Score Sadr candidates and pick best for this meter
            best_sadr = None
            best_sadr_score = -1
            best_sadr_input = ""

            for cand in sadr_candidates:
                cand_pat = cand[1]
                match = self._find_best_component_match(cand_pat, patterns["sadr"])
                if match["score"] > best_sadr_score:
                    best_sadr_score = match["score"]
                    best_sadr = match
                    best_sadr_input = cand_pat

            # 2. Score Ajuz candidates (if exists)
            best_ajuz = None
            best_ajuz_score = -1
            best_ajuz_input = ""

            has_ajuz = any(c[1] for c in ajuz_candidates)

            if has_ajuz:
                for cand in ajuz_candidates:
                    cand_pat = cand[1]
                    if not cand_pat:
                        continue
                    match = self._find_best_component_match(cand_pat, patterns["ajuz"])
                    if match["score"] > best_ajuz_score:
                        best_ajuz_score = match["score"]
                        best_ajuz = match
                        best_ajuz_input = cand_pat

            # 3. Calculate Combined Score
            s_score = best_sadr_score
            a_score = best_ajuz_score if best_ajuz else 0

            # Compatibility Check: is the (Arudh, Dharb) combination registered
            # as a valid pairing for this meter?
            is_valid_pair = False
            if best_sadr and best_sadr["ref"] and (not has_ajuz or (best_ajuz and best_ajuz["ref"])):
                s_pat = best_sadr["ref"]["pattern"]
                a_pat = best_ajuz["ref"]["pattern"] if best_ajuz else ""
                if (s_pat, a_pat) in patterns["pairs"]:
                    is_valid_pair = True

            if has_ajuz:
                total_score = (s_score + a_score) / 2
            else:
                total_score = s_score

            candidates.append({
                "meter": name,
                "score": total_score,
                "sadr_match": best_sadr,
                "ajuz_match": best_ajuz,
                "valid_pair": is_valid_pair,
                "sadr_input_pattern": best_sadr_input,
                "ajuz_input_pattern": best_ajuz_input
            })

        # Sort candidates (score rounded to 3 places so near-ties fall through
        # to the pair-validity and priority tie-breakers)
        candidates.sort(key=lambda x: (
            round(x["score"], 3),
            x["valid_pair"],
            METER_PRIORITY.get(x["meter"], 0)
        ), reverse=True)

        if not candidates:
            return []

        return candidates

    def _find_best_component_match(self, input_pattern, component_patterns):
        """
        Find the reference pattern most similar to `input_pattern`.

        Args:
            input_pattern (str): Binary pattern extracted from the input text.
            component_patterns (list[dict]): Reference entries, each carrying a "pattern" key.

        Returns:
            dict: {"score": float, "ref": dict | None}; `ref` is None when
                `component_patterns` is empty.
        """
        best_score = -1
        best_ref = None

        for item in component_patterns:
            ref_pat = item["pattern"]
            score = self._get_similarity(ref_pat, input_pattern)
            if score > best_score:
                best_score = score
                best_ref = item

        return {"score": best_score, "ref": best_ref}

    def _analyze_verse(self, res, meter_name):
        """
        Produce a detailed per-foot report of one verse against a specific meter.

        Args:
            res (dict): Pre-processed verse entry (see process_poem's temp_results).
            meter_name (str): Name of the meter to analyze against.

        Returns:
            dict: Verse report with overall score and per-foot Sadr/Ajuz analyses,
                or {"error": ...} when the meter is unknown.
        """
        # Re-run match against specific meter to get details
        patterns = self.precomputed_patterns.get(meter_name)
        if not patterns:
            return {"error": "Meter data not found"}

        sadr_match = self._find_best_component_match(res["sadr"]["pattern"], patterns["sadr"])
        ajuz_match = None
        if res["ajuz"]["pattern"]:
            ajuz_match = self._find_best_component_match(res["ajuz"]["pattern"], patterns["ajuz"])

        # Get allowed feet for this meter for greedy analysis
        bahr_cls = self.meter_classes.get(meter_name)
        allowed_sadr = []
        allowed_ajuz = []
        if bahr_cls:
            inst = bahr_cls()
            allowed_sadr = inst.get_allowed_feet_patterns(0)
            allowed_ajuz = inst.get_allowed_feet_patterns(1)

        # Analyze Sadr Feet
        sadr_analysis = self._analyze_feet(res["sadr"]["pattern"], allowed_sadr, sadr_match["ref"])

        ajuz_analysis = None
        if res["ajuz"]["pattern"]:
            ajuz_analysis = self._analyze_feet(res["ajuz"]["pattern"], allowed_ajuz, ajuz_match["ref"])

        return {
            "verse_index": res["index"],
            "sadr_text": res["sadr"]["text"],
            "ajuz_text": res["ajuz"]["text"],
            "input_pattern": res["sadr"]["pattern"] + res["ajuz"]["pattern"],
            "best_ref_pattern": (sadr_match["ref"]["pattern"] if sadr_match["ref"] else "") + 
                                (ajuz_match["ref"]["pattern"] if ajuz_match and ajuz_match["ref"] else ""),
            # Average the two hemistich scores (or take the Sadr alone).
            "score": round(
                (sadr_match["score"] + (ajuz_match["score"] if ajuz_match else 0)) / (2 if ajuz_match else 1), 2
            ),
            "sadr_analysis": sadr_analysis,
            "ajuz_analysis": ajuz_analysis
        }

    def _analyze_feet(self, input_pattern, allowed_feet_list, best_ref):
        """
        Maps input bits to feet using greedy matching against ALLOWED forms.
        This prevents one broken foot from misaligning the rest if they are valid.

        Args:
            input_pattern (str): Binary pattern of one hemistich.
            allowed_feet_list (list[list[str]]): Valid binary forms per foot position.
            best_ref (dict | None): Best overall reference match; its "feet" list is
                the fallback when `allowed_feet_list` is empty.

        Returns:
            list[dict]: One entry per foot with expected/actual segments, a score,
                and a status of "ok", "broken", "missing", or "extra_bits".
        """
        analysis = []
        current_idx = 0

        # Fallback to best_ref feet if allowed_feet_list is not provided (should not happen)
        ref_feet_backup = best_ref["feet"] if best_ref else []

        # Determine number of feet to analyze
        num_feet = len(allowed_feet_list) if allowed_feet_list else len(ref_feet_backup)

        for i in range(num_feet):
            # 1. Get valid candidates for this foot position
            if allowed_feet_list:
                candidates = allowed_feet_list[i]
            elif i < len(ref_feet_backup):
                candidates = [ref_feet_backup[i]]
            else:
                candidates = []

            # Sort candidates by length descending to try longest match first
            candidates = sorted(candidates, key=len, reverse=True)

            best_local_match = None
            best_local_score = -1

            # Try to find best fit at current_idx
            # We look ahead by len(cand)
            for cand in candidates:
                cand_len = len(cand)
                # Get segment of equal length (or truncated if at end)
                segment = input_pattern[current_idx : current_idx + cand_len]

                if not segment:
                    break  # No more input

                score = self._get_similarity(cand, segment)

                # Boost score if lengths match (to prefer aligning valid feet)
                if len(segment) == cand_len:
                    if score == 1.0:
                        # Found perfect match, take it immediately
                        best_local_match = cand
                        best_local_score = 1.0
                        break

                if score > best_local_score:
                    best_local_score = score
                    best_local_match = cand
                    # (The consumed length is decided after this loop.)

            # If no candidates (e.g., error in definitions), break
            if not best_local_match and candidates:
                best_local_match = candidates[0] # Default to first/longest

            # If we still didn't find anything (e.g. input exhausted), skip
            if not best_local_match:
                analysis.append({
                    "foot_index": i,
                    "expected_pattern": candidates[0] if candidates else "?",
                    "actual_segment": "MISSING",
                    "score": 0.0,
                    "status": "missing"
                })
                continue

            # Extract the segment we decided to consume.
            # We consume the length of the EXPECTED pattern (assuming the poet
            # *tried* to write it), which keeps the following feet aligned.

            consume_len = len(best_local_match)
            # Clamp to input length
            end_idx = min(current_idx + consume_len, len(input_pattern))
            actual_segment = input_pattern[current_idx : end_idx]

            # Recalculate score on the final decided segment
            final_score = self._get_similarity(best_local_match, actual_segment)

            status = "ok" if final_score == 1.0 else "broken"
            if not actual_segment:
                status = "missing"

            analysis.append({
                "foot_index": i,
                "expected_pattern": best_local_match,
                "actual_segment": actual_segment,
                "score": round(final_score, 2),
                "status": status
            })

            current_idx = end_idx

        # Check for extra bits
        if current_idx < len(input_pattern):
            extra = input_pattern[current_idx:]
            analysis.append({
                "foot_index": num_feet,
                "expected_pattern": "",
                "actual_segment": extra,
                "score": 0,
                "status": "extra_bits"
            })

        return analysis

process_poem(verses, meter_name=None)

Analyzes a list of verses to detect the meter and evaluate prosodic correctness.

Parameters:

Name Type Description Default
verses list[tuple[str, str]]

A list of tuples, where each tuple contains the Sadr (first hemistich) and Ajuz (second hemistich) of a verse.

required
meter_name str

The name of a specific meter to force the analysis against. If provided, auto-detection is skipped. Defaults to None.

None

Returns:

Name Type Description
dict

A dictionary containing: `meter` (str) — the name of the detected or forced meter; and `verses` (list[dict]) — a list of analysis results for each verse, each including `score` (float, compatibility score 0.0–1.0), `sadr_analysis` (list[dict], detailed foot-by-foot analysis of the Sadr), and `ajuz_analysis` (list[dict], detailed foot-by-foot analysis of the Ajuz).

Source code in pyarud/processor.py
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
def process_poem(self, verses, meter_name=None):
    """
    Analyze a poem's verses, detect (or force) its meter, and score each verse.

    Args:
        verses (list[tuple[str, str]]): One (Sadr, Ajuz) tuple per verse — the
            first and second hemistichs respectively.
        meter_name (str, optional): Force the analysis against this meter and
            skip auto-detection. Defaults to None.

    Returns:
        dict: Either {"error": ...} when no meter could be detected, or:
            - `meter` (str): The detected or forced meter name.
            - `verses` (list[dict]): Per-verse analysis results, each carrying a
              `score` (float, 0.0 - 1.0) plus `sadr_analysis` and
              `ajuz_analysis` foot-by-foot breakdowns.
    """
    meter_votes = Counter()
    prepared = []

    # Pass 1: convert each verse to binary patterns and vote on the meter.
    for idx, (sadr_text, ajuz_text) in enumerate(verses):
        # Each hemistich yields up to two pattern candidates:
        # saturated (standard / Mutlaq) and unsaturated (Mudawwar-fragment / Muqayyad).
        sadr_sat = self.converter.prepare_text(sadr_text, saturate=True)
        sadr_unsat = self.converter.prepare_text(sadr_text, saturate=False)
        ajuz_sat = self.converter.prepare_text(ajuz_text, saturate=True)
        ajuz_unsat = self.converter.prepare_text(ajuz_text, saturate=False, muqayyad=True)

        # Single-hemistich input: blank out the Ajuz candidates entirely.
        if not ajuz_text:
            ajuz_sat = ("", "")
            ajuz_unsat = ("", "")

        # Keep the unsaturated variant only when it produced a distinct pattern.
        sadr_options = [sadr_sat]
        if sadr_sat[1] != sadr_unsat[1]:
            sadr_options.append(sadr_unsat)

        ajuz_options = [ajuz_sat]
        if ajuz_sat[1] != ajuz_unsat[1]:
            ajuz_options.append(ajuz_unsat)

        # Rank all meters (or evaluate the forced one) for this verse.
        ranked = self._find_best_meter(sadr_options, ajuz_options, target_meter=meter_name)
        winner = None
        if ranked:
            winner = ranked[0]
            meter_votes[winner["meter"]] += 1

        # Resolve which candidate tuples the winning match actually used.
        chosen_sadr = sadr_options[0]
        chosen_ajuz = ajuz_options[0]
        if winner:
            if "sadr_input_pattern" in winner:
                chosen_sadr = next(
                    (c for c in sadr_options if c[1] == winner["sadr_input_pattern"]),
                    chosen_sadr,
                )
            if "ajuz_input_pattern" in winner:
                chosen_ajuz = next(
                    (c for c in ajuz_options if c[1] == winner["ajuz_input_pattern"]),
                    chosen_ajuz,
                )

        prepared.append({
            "index": idx,
            "sadr": {"text": sadr_text, "pattern": chosen_sadr[1], "arudi": chosen_sadr[0]},
            "ajuz": {"text": ajuz_text, "pattern": chosen_ajuz[1], "arudi": chosen_ajuz[0]},
            "match": winner,
        })

    # A forced meter always wins; otherwise take the majority vote.
    if meter_name:
        chosen_meter = meter_name
    elif meter_votes:
        chosen_meter = meter_votes.most_common(1)[0][0]
    else:
        return {"error": "Could not detect any valid meter."}

    # Pass 2: analyze every verse against the single chosen meter.
    return {
        "meter": chosen_meter,
        "verses": [self._analyze_verse(entry, chosen_meter) for entry in prepared],
    }

Bahr (Meters)

pyarud.bahr.Bahr

Base class for defining poetic meters (Buhur).

Subclasses define the standard feet (tafeelat), valid Arudh/Dharb combinations, and disallowed variations (Zihaf) for specific positions.

Source code in pyarud/bahr.py
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
class Bahr:
    """
    Base class for defining poetic meters (Buhur).

    Subclasses define the standard feet (tafeelat), valid Arudh/Dharb combinations,
    and disallowed variations (Zihaf) for specific positions.
    """
    # Ordered feet of one shatr; the final entry is the Arudh/Dharb position.
    tafeelat: tuple[type[Tafeela], ...] = ()
    # Two-shatr meters: dict mapping each allowed Arudh class to its compatible
    # Dharb classes. Single-shatr meters may instead use a plain set of endings.
    arod_dharbs_map: dict[type[BaseEllahZehaf], tuple[type[BaseEllahZehaf], ...]] | set[type[BaseEllahZehaf]] = {}
    # Derived meters whose patterns are merged into this one's (see detailed_patterns).
    sub_bahrs: tuple[type["Bahr"], ...] = ()
    # True for meters consisting of a single hemistich (Mashtoor/Manhook).
    only_one_shatr = False
    # shatr index -> per-foot collections of Zihaf classes forbidden in the hashw.
    disallowed_zehafs_for_hashw: dict[int, tuple[list[type[BaseEllahZehaf]], ...]] = {}

    @property
    def last_tafeela(self):
        """A fresh instance of the final (Arudh/Dharb) tafeela."""
        return self.tafeelat[-1]()

    def get_shatr_hashw_combinations(self, shatr_index=0):
        """
        Return the allowed tafeela forms for every hashw foot of a shatr.

        Args:
            shatr_index (int): 0 for Sadr, 1 for Ajuz; selects which
                disallowed-zihaf restrictions apply.

        Returns:
            list[list]: One list of permitted Tafeela forms per hashw position.
        """
        combinations = []
        # Hashw is everything except the last tafeela (Arudh/Dharb)
        for i, tafeela_class in enumerate(self.tafeelat[:-1]):
            tafeela = tafeela_class()
            forms = tafeela.all_zehaf_tafeela_forms()

            # Filter disallowed zehafs
            if shatr_index in self.disallowed_zehafs_for_hashw:
                disallowed = self.disallowed_zehafs_for_hashw[shatr_index]
                if i < len(disallowed):
                    forms = [f for f in forms if f.applied_ella_zehaf_class not in disallowed[i]]

            combinations.append(forms)
        return combinations

    def get_allowed_feet_patterns(self, shatr_index=0):
        """
        Returns a list of lists, where index i contains all valid binary strings for foot i.
        Used for granular analysis to align input to valid feet.
        """
        allowed_per_index = []

        # Hashw feet
        hashw_combs = self.get_shatr_hashw_combinations(shatr_index)
        for _, forms in enumerate(hashw_combs):
            allowed_per_index.append([str(f) for f in forms])

        # Last foot (Arudh/Dharb)
        last_feet = set()
        if self.only_one_shatr:
            # Treat endings as Arudh
            # NOTE(review): the two branches below are identical — iterating a
            # dict yields its keys, so the isinstance split is redundant.
            if isinstance(self.arod_dharbs_map, set):
                for z_cls in self.arod_dharbs_map:
                    try:
                        last_feet.add(str(z_cls(self.last_tafeela).modified_tafeela))
                    except AssertionError:
                        continue
            else:
                for z_cls in self.arod_dharbs_map:
                    try:
                        last_feet.add(str(z_cls(self.last_tafeela).modified_tafeela))
                    except AssertionError:
                        continue
        else:
            if shatr_index == 0:  # Sadr -> Arudh
                for z_cls in self.arod_dharbs_map.keys():
                    try:
                        last_feet.add(str(z_cls(self.last_tafeela).modified_tafeela))
                    except AssertionError:
                        continue
            else:  # Ajuz -> Dharb
                for d_list in self.arod_dharbs_map.values():
                    for z_cls in d_list:
                        try:
                            last_feet.add(str(z_cls(self.last_tafeela).modified_tafeela))
                        except AssertionError:
                            continue

        allowed_per_index.append(list(last_feet))
        return allowed_per_index

    @property
    def detailed_patterns(self):
        """
        Returns structured patterns for Sadr and Ajuz separately.

        Returns:
            dict: {
                "sadr": list of {"pattern", "feet", ...} entries,
                "ajuz": list of {"pattern", "feet", ...} entries,
                "pairs": set of (sadr_pattern, ajuz_pattern) valid pairings,
            }
        """
        patterns = {
            "sadr": [],
            "ajuz": [],
            "pairs": set() # Set of (sadr_pattern_str, ajuz_pattern_str) for validation
        }

        if self.only_one_shatr:
             # Single shatr meters (Mashtoor/Manhook)
             # We treat them as Sadr only
             hashw = self.get_shatr_hashw_combinations()

             # For single shatr, the "Arudh" is the end of the line

             # Collect all allowed endings from the map.
             # In known subclasses arod_dharbs_map is a set here (e.g. {Waqf, Kasf});
             # if a dict were ever used, iterating it yields its keys anyway.

             endings = []
             if isinstance(self.arod_dharbs_map, set):
                 for z_cls in self.arod_dharbs_map:
                     try:
                         endings.append(z_cls(self.last_tafeela).modified_tafeela)
                     except AssertionError:
                         continue
             else:
                 # If it's a dict (some Mashtoors might use dict?), iterate keys
                 for z_cls in self.arod_dharbs_map:
                     try:
                         endings.append(z_cls(self.last_tafeela).modified_tafeela)
                     except AssertionError:
                         continue

             permutations = list(itertools.product(*hashw, endings))
             for p in permutations:
                 # p is a tuple of Tafeela objects
                 feet_strs = [str(t) for t in p]
                 full_str = "".join(feet_strs)
                 patterns["sadr"].append({
                     "pattern": full_str,
                     "feet": feet_strs,
                     "type": "single_shatr"
                 })
                 # Pairs logic doesn't apply or is trivial
                 patterns["pairs"].add((full_str, ""))

        else:
            # Two shatrs
            sadr_hashw = self.get_shatr_hashw_combinations(0)
            ajuz_hashw = self.get_shatr_hashw_combinations(1)

            for arudh_z_cls, dharb_z_list in self.arod_dharbs_map.items():
                # 1. Generate Arudh (End of Sadr); skip endings whose
                # preconditions fail (they raise AssertionError).
                try:
                    arudh_obj = arudh_z_cls(self.last_tafeela).modified_tafeela
                except AssertionError:
                    continue

                arudh_str = str(arudh_obj)

                # 2. Generate Sadr variations for this Arudh
                sadr_perms = list(itertools.product(*sadr_hashw, [arudh_obj]))

                for sp in sadr_perms:
                    feet_strs = [str(t) for t in sp]
                    full_sadr = "".join(feet_strs)

                    patterns["sadr"].append({
                        "pattern": full_sadr,
                        "feet": feet_strs,
                        "arudh_foot": arudh_str,
                        "arudh_class": arudh_z_cls.__name__
                    })

                    # 3. Generate compatible Dharbs (End of Ajuz)
                    # dharb_z_list is tuple of allowed classes for this Arudh
                    compatible_dharbs = []
                    for d_z in dharb_z_list:
                        try:
                            dharb_obj = d_z(self.last_tafeela).modified_tafeela
                            compatible_dharbs.append(dharb_obj)
                        except AssertionError:
                            continue

                    if not compatible_dharbs:
                        continue

                    # 4. Generate Ajuz variations for these Dharbs
                    ajuz_perms = list(itertools.product(*ajuz_hashw, compatible_dharbs))

                    for ap in ajuz_perms:
                        feet_strs_a = [str(t) for t in ap]
                        full_ajuz = "".join(feet_strs_a)

                        patterns["ajuz"].append({
                            "pattern": full_ajuz,
                            "feet": feet_strs_a,
                            "dharb_foot": feet_strs_a[-1],
                            "allowed_arudhs": [arudh_str] # Valid only if Sadr ended with this
                        })

                        # Register valid pair
                        patterns["pairs"].add((full_sadr, full_ajuz))

        # NOTE(review): duplicate entries can occur if multiple Arudh classes
        # yield the same pattern; the processor's matching tolerates this.

        # Add sub-bahrs
        for sub in self.sub_bahrs:
            sub_p = sub().detailed_patterns
            patterns["sadr"].extend(sub_p["sadr"])
            patterns["ajuz"].extend(sub_p["ajuz"])
            patterns["pairs"].update(sub_p["pairs"])

        return patterns

    @property
    def bait_combinations(self):
        """
        Deprecated wrapper for backward compatibility.

        Returns:
            list[str]: Flattened full-line binary patterns, sorted by length.
        """
        p = self.detailed_patterns
        if self.only_one_shatr:
            return sorted(list(set(x["pattern"] for x in p["sadr"])), key=len)

        # Reconstruct full lines from pairs
        return sorted([s+a for s,a in p["pairs"]], key=len)

detailed_patterns property

Returns structured patterns for Sadr and Ajuz separately.

get_allowed_feet_patterns(shatr_index=0)

Returns a list of lists, where index i contains all valid binary strings for foot i. Used for granular analysis to align input to valid feet.

Source code in pyarud/bahr.py
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
def get_allowed_feet_patterns(self, shatr_index=0):
    """
    Returns a list of lists, where index i contains all valid binary strings for foot i.
    Used for granular analysis to align input to valid feet.

    Args:
        shatr_index (int): 0 selects the Sadr (first hemistich) whose last
            foot is the Arudh; any other value selects the Ajuz, whose last
            foot is the Dharb.

    Returns:
        list[list[str]]: One list of valid binary foot patterns per foot
        position; the final entry holds the Arudh/Dharb alternatives.
    """

    def collect_endings(zihaf_classes, target):
        # Apply each Zihaf/Ellah transform to the final tafeela; transforms
        # signal "not applicable" by raising AssertionError and are skipped.
        for z_cls in zihaf_classes:
            try:
                target.add(str(z_cls(self.last_tafeela).modified_tafeela))
            except AssertionError:
                continue

    # Hashw (interior) feet: all allowed surface forms per position.
    allowed_per_index = [
        [str(form) for form in forms]
        for forms in self.get_shatr_hashw_combinations(shatr_index)
    ]

    # Last foot (Arudh/Dharb)
    last_feet = set()
    if self.only_one_shatr:
        # Treat endings as Arudh. Iterating arod_dharbs_map directly works
        # for both a set and a dict (dict iteration yields its keys), so the
        # original's two identical set/non-set branches are collapsed here.
        collect_endings(self.arod_dharbs_map, last_feet)
    elif shatr_index == 0:  # Sadr -> Arudh
        collect_endings(self.arod_dharbs_map.keys(), last_feet)
    else:  # Ajuz -> Dharb
        for d_list in self.arod_dharbs_map.values():
            collect_endings(d_list, last_feet)

    allowed_per_index.append(list(last_feet))
    return allowed_per_index

Arudi Converter

pyarud.arudi.ArudiConverter

Source code in pyarud/arudi.py
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
class ArudiConverter:
    """Converts vocalized Arabic text to Arudi (phonetic) writing plus a
    binary prosody pattern ("1" = mutaharrik/moving letter, "0" =
    sakin/quiescent letter).

    Entry point is :meth:`prepare_text`; the private methods form its
    normalization pipeline and are applied in a fixed order.
    """

    def __init__(self):
        # Character classes used throughout the pipeline.
        self.harakat = [KASRA, FATHA, DAMMA]  # kasra, fatha, damma
        self.sukun = [SUKUN]  # sukun
        self.mostly_saken = [ALEF, WAW, ALEF_MAKSURA, YEH]  # alef, waw, alef maqsurah, ya'a
        self.tnween_chars = [DAMMATAN, KASRATAN, FATHATAN]  # damm, kasra, fatha tanween
        self.shadda_chars = [SHADDA]
        self.all_chars = list(LETTERS + " ")
        # Whitelist of every character _extract_pattern is allowed to process.
        self.prem_chars = (
            self.harakat + self.sukun + self.mostly_saken + self.tnween_chars + self.shadda_chars + self.all_chars
        )

        # Word replacements for Arudi writing.
        # Maps dictionary spellings to phonetic spellings (e.g. words whose
        # long vowel is written with a dagger alif or omitted entirely).
        self.CHANGE_LST = {
            "هذا": "هَاذَا",
            "هذه": "هَاذِه",
            "هذان": "هَاذَان",
            "هذين": "هَاذَين",
            "هؤلاء": "هَاؤُلَاء",
            "ذلك": "ذَالِك",
            "ذلكما": "ذَالِكُمَا",
            "ذلكم": "ذَالِكُم",
            "أولئك": "أُلَائِك",
            "أولئكم": "أُلَائِكُم",
            "الله": "اللَّاه",
            "اللهم": "اللَّاهُمّ",
            "إله": "إِلَاه",
            "الإله": "الإِلَاه",
            "إلهي": "إِلَاهي",
            "إلهنا": "إِلَاهنا",
            "إلهكم": "إِلَاهكم",
            "إلههم": "إِلَاههم",
            "إلههن": "إِلَاههن",
            "رحمن": "رَحمَان",
            "الرحمن": "الرَّحمَان",
            "طاوس": "طَاوُوس",
            "داود": "دَاوُود",
            "لكن": "لَاكِن",
            "لكنّ": "لَاكِنّ",
            "لكنه": "لَاكِنّهُ",
            "طه": "طَاهَ" + FATHA,
            "لله": "لِللَاهِ",
            "آه": "أَاهِ",
            "هو": "هْوَ",
            "هي": "هْيَ",
        }

    def register_custom_spelling(self, word, replacement):
        """
        Register a custom Arudi spelling for a specific word.

        Args:
            word (str): The word (without diacritics) to replace (e.g., 'لكن').
            replacement (str): The phonetic Arudi spelling (e.g., 'لَاكِن').
        """
        self.CHANGE_LST[word] = replacement

    def _normalize_shadda(self, text):
        """Swap Haraka+Shadda into Shadda+Haraka so Shadda always comes first."""
        # Ensure Shadda comes before Harakat/Tanween
        harakat_all = "".join(self.harakat + self.tnween_chars)
        shadda = "".join(self.shadda_chars)
        return re.sub(f"([{harakat_all}])([{shadda}])", r"\2\1", text)

    def _normalize_orthography(self, text):
        """Normalize dagger alif, bare-alif vowels, and Alif+Fathatan order."""
        # Normalize Dagger Alif (Superscript Alif) to standard Alif
        text = text.replace("\u0670", ALEF)

        # Remove Harakat from standard Alif (ALEF cannot carry vowel unless it's Hamza)
        # This fixes cases where text has L+A+Fatha (treated as L+A(mover))
        harakat_pattern = f"[{FATHA}{DAMMA}{KASRA}]"
        text = re.sub(f"{ALEF}{harakat_pattern}", ALEF, text)

        # Normalize Alif + Tanween Fath -> Tanween Fath + Alif
        # (Ensures consistent processing order)
        text = re.sub(f"{ALEF}{FATHATAN}", f"{FATHATAN}{ALEF}", text)

        return text

    def _normalize_ligatures(self, text):
        """Decompose Lam-Alif ligature code points into Lam + Alif letters."""
        # Decompose Lam-Alif ligatures with potential diacritics
        # Matches Ligature + Optional Haraka
        # Replaces with Lam + Optional Haraka + Second Letter

        harakat_pattern = f"[{''.join(self.harakat + self.tnween_chars)}]"

        def replace_la(match):
            # match.group(0) is the ligature + optional haraka
            # We want L + haraka (if any) + A
            s = match.group(0)
            haraka = s[1:] if len(s) > 1 else ""
            return "ل" + haraka + "ا"

        def replace_la_hamza_above(match):
            s = match.group(0)
            haraka = s[1:] if len(s) > 1 else ""
            return "ل" + haraka + "أ"

        def replace_la_hamza_below(match):
            s = match.group(0)
            haraka = s[1:] if len(s) > 1 else ""
            return "ل" + haraka + "إ"

        def replace_la_madda(match):
            s = match.group(0)
            haraka = s[1:] if len(s) > 1 else ""
            return "ل" + haraka + "آ"

        text = re.sub(f"ﻻ({harakat_pattern})?", replace_la, text)
        text = re.sub(f"ﻷ({harakat_pattern})?", replace_la_hamza_above, text)
        text = re.sub(f"ﻹ({harakat_pattern})?", replace_la_hamza_below, text)
        text = re.sub(f"ﻵ({harakat_pattern})?", replace_la_madda, text)

        return text

    def _resolve_wasl(self, text):
        """
        Handles Hamzat al-Wasl (Connecting Alif) and Iltiqa al-Sakinayn.
        1. Drop Long Vowel + Space + Alif Wasl (e.g. "Idhā Ishtadda" -> "Idhshtadda").
        2. Drop Space + Alif Wasl (e.g. "Bika Al-" -> "Bikal-").
        """
        # Pattern: Letter + (Optional Diacritic) + (Long Vowel) + Space + Alif -> Letter + Diacritic
        # The original regex was flawed because it did not capture the diacritic.
        # This version captures the letter and its optional diacritic, preserving it.
        # Using S* to handle multiple diacritics (e.g., shadda + fatha).
        text = re.sub(r"([^\s]\S*)([اىيو])\s+ا", r"\1", text)

        # Pattern: Space + Alif (Wasl) -> Drop both
        # Matches any word starting with bare Alif preceded by space.
        text = re.sub(r"\s+ا", "", text)

        # 3. Drop Alif of "Allah" if prefixed by Fa/Wa/Ba/Ta/Kaf
        # Pattern: (Prefix)(Vowel?)Alif(LamLam) -> (Prefix)(Vowel?)LamLam
        prefixes = "\u0641\u0648\u0628\u062a\u0643"
        harakat = "".join(self.harakat)
        text = re.sub(f"([{prefixes}])([{harakat}]?)ا(لل)", r"\1\2\3", text)

        return text

    def _handle_space(self, plain_chars):
        """Drop the last emitted letter from the buffer (the letter whose
        sukun was elided); if the buffer ends with a space, the letter
        before that space is dropped along with the space."""
        if not plain_chars:
            return plain_chars

        if plain_chars[-1] == " ":
            return plain_chars[:-2]
        else:
            return plain_chars[:-1]

    def _remove_extra_harakat(self, text):
        """Collapse runs of consecutive harakat, keeping only the last one."""
        out = ""
        i = 0
        while i < len(text):
            if i < len(text) - 1:
                if text[i] in self.harakat and text[i + 1] in self.harakat:
                    # Skip this haraka; the following one wins.
                    i += 1
                    continue
            out += text[i]
            i += 1
        return out

    def _process_specials_before(self, bait):
        """Pre-extraction normalization: leading Alif, Al- prefixes, solar
        Lam, common orthographic substitutions, and CHANGE_LST word swaps."""
        # Handle specific starting Alif cases
        if bait and bait[0] == "ا":
            # Heuristic: randomly choose or based on context. Bohour used random.
            # We'll default to Fatha for consistency in deterministic output,
            # or Hamza with Fatha.
            bait = "أَ" + bait[1:]

        # Detach prefixes to handle Al- logic (WaAl -> Wa Al)
        # Matches: Fa, Waw, Ba, Ta, Kaf followed by Al, at start of word
        bait = re.sub(r"(^|\s)([فوبتك])([َُِ])?ال", r"\1\2\3 ال", bait)

        # Solar Lam Handling: Al + Sun Letter -> A + Sun Letter
        # Drops the Lam which is silent in Solar cases
        sun_letters = "تثدذرزسشصضطظلن"
        bait = re.sub(f" ال([{sun_letters}])", r" ا\1", bait)

        bait = bait.replace("وا ", "و ")
        if bait.endswith("وا"):
            bait = bait[:-1]

        # NOTE(review): the endswith("وْا") check below can never fire — the
        # replace on the previous line already removed every "وْا". Confirm
        # intent before cleaning up.
        bait = bait.replace("وْا", "و")
        if bait.endswith("وْا"):
            bait = bait[:-2] + "و"

        # Common substitutions
        bait = bait.replace("الله", "اللاه")
        bait = bait.replace("اللّه", "الله")
        bait = bait.replace("إلَّا", "إِلّا")
        bait = bait.replace("نْ ال", "نَ ال")
        bait = bait.replace("لْ ال", "لِ ال")
        bait = bait.replace("إلَى", "إِلَى")
        bait = bait.replace("إذَا", "إِذَا")
        bait = bait.replace("ك ", "كَ ")
        bait = bait.replace(" ال ", " الْ ")
        bait = bait.replace("ْ ال", "ِ ال")
        bait = bait.replace("عَمْرٍو", "عَمْرٍ")
        bait = bait.replace("عَمْرُو", "عَمْرُ")

        # Word replacements from CHANGE_LST
        out = []
        valid_prefixes = ["و", "ف", "ك", "ب", "ل", "وب", "فك", "ول", "فل"]

        # Prepare regex for stripping harakat but keeping shadda
        # Exclude SHADDA from removal list
        removable_chars = self.harakat + self.sukun + self.tnween_chars
        strip_harakat_pattern = f"[{''.join(removable_chars)}]"

        for word in bait.split(" "):
            # 1. Try match with Shadda preserved (e.g. for 'لكنّ')
            cleaned_with_shadda = re.sub(strip_harakat_pattern, "", word)
            # 2. Try match with Shadda removed (standard)
            cleaned_plain = strip_tashkeel(word)

            found = False

            # Check Exact Match (Shadda first, then Plain)
            for candidate in [cleaned_with_shadda, cleaned_plain]:
                if candidate in self.CHANGE_LST:
                    out.append(self.CHANGE_LST[candidate])
                    found = True
                    break
            if found:
                continue

            # Prefix check
            # We iterate candidates again to check prefixes
            for candidate in [cleaned_with_shadda, cleaned_plain]:
                if found:
                    break
                for key, replacement in self.CHANGE_LST.items():
                    if candidate.endswith(key):
                        prefix = candidate[:-len(key)]
                        if prefix in valid_prefixes:
                            # Restore the conventional haraka for each prefix letter.
                            prefix_harakat = {
                                "و": "وَ", "ف": "فَ", "ك": "كَ", "ب": "بِ", "ل": "لِ"
                            }

                            # Construct new word
                            new_prefix = ""
                            for p_char in prefix:
                                new_prefix += prefix_harakat.get(p_char, p_char)

                            out.append(new_prefix + replacement)
                            found = True
                            break

            if not found:
                out.append(word)

        bait = " ".join(out)

        # Ensure second char isn't a bare letter if first is
        if len(bait) > 1 and bait[1] in self.all_chars:
            bait = bait[0] + self.harakat[1] + bait[1:]

        # Filter trailing alif after tanween
        final_chars = []
        i = 0
        while i < len(bait):
            if bait[i] == "ا" and i > 0 and bait[i - 1] in self.tnween_chars:
                i += 1
                # skip following harakat if any
                if i < len(bait) and bait[i] in self.harakat + self.sukun + self.tnween_chars + self.shadda_chars:
                    i += 1
                continue
            final_chars.append(bait[i])
            i += 1

        return "".join(final_chars)

    def _process_specials_after(self, bait):
        """Post-extraction fix-up: Ta Marbuta before Noon is read as Ta."""
        bait = bait.replace("ةن", "تن")
        return bait

    def _extract_pattern(self, text, saturate=True, muqayyad=False):
        """
        Core logic to extract binary pattern and arudi text.
        Based on Bohour's extract_tf3eelav3.

        Walks the text two characters at a time (letter + following mark),
        emitting "1" for a moving letter and "0" for a quiescent one.

        Args:
            text (str): Pre-normalized, vocalized text.
            saturate (bool): If True, saturate (Ishba') the final letter
                with the matching long vowel and force a final "0".
            muqayyad (bool): Restricted rhyme — treat the final vowel as
                sakin and skip saturation.

        Returns:
            tuple[str, str]: (arudi-style plain text, binary pattern).
        """
        text = self._remove_extra_harakat(text)
        chars = list(text.replace(ALEF_MADDA, "ءَا").strip())  # Replace Madda
        chars = [c for c in chars if c in self.prem_chars]
        chars = list(re.sub(" +", " ", "".join(chars).strip()))

        # DEBUG
        # print(f"Trace: {chars}")

        out_pattern = ""
        plain_chars = ""

        # NOTE: loop stops one short of the end; the trailing character is
        # handled by the saturation block after the loop.
        i = 0
        while i < len(chars) - 1:
            char = chars[i]
            next_char = chars[i + 1]
            # print(f"i={i}, char={char}, next={next_char}")

            if char in self.all_chars:
                if char == " ":
                    plain_chars += char
                    i += 1
                    continue

                # Lookahead
                if next_char == " " and i + 2 < len(chars):
                    next_char = chars[i + 2]

                next_next_char = None
                if i < len(chars) - 2:
                    next_next_char = chars[i + 2]

                prev_digit = out_pattern[-1] if len(out_pattern) > 0 else ""

                # Logic
                if next_char in self.harakat:
                    # Check for Muqayyad (Restricted Rhyme) at the very end
                    # If we are at the last character group (char + haraka is end of string)
                    is_last_group = (i + 2 >= len(chars))
                    # Or if followed by space then end? (Arudi usually strips trailing spaces but let's be safe)

                    if muqayyad and is_last_group:
                        # NOTE(review): both branches below are identical; the
                        # comments document why "0" is emitted either way.
                        # Treat as Sakin (drop vowel)
                        if prev_digit != "0":
                            out_pattern += "0"
                            plain_chars += char
                        else:
                            # If prev was Sakin, we have Iltiqa Sakinayn at end.
                            # In Muqayyad rhyme, this is allowed (e.g. 'Mard').
                            # But typically we avoid 00.
                            # Standard Arudi: 00 is allowed at end (Waqf).
                            out_pattern += "0"
                            plain_chars += char
                        # Skip the haraka
                    else:
                        out_pattern += "1"
                        plain_chars += char

                elif next_char in self.sukun:
                    if prev_digit != "0":
                        out_pattern += "0"
                        plain_chars += char
                    elif (i + 1) == len(chars) - 1:
                        # End of line sukun handling: Allow consecutive Sukun (00)
                        out_pattern += "0"
                        plain_chars += char
                    else:
                        plain_chars = self._handle_space(plain_chars) + char

                elif next_char in self.tnween_chars:
                    if char != "ا":
                        plain_chars += char
                    plain_chars += "ن"
                    out_pattern += "10"

                    # Skip trailing Alif (Tanween Fath)
                    if i + 2 < len(chars) and chars[i + 2] == "ا":
                        i += 1

                elif next_char in self.shadda_chars:
                    # Shadda doubles the letter: first copy sakin, second moving.
                    if prev_digit != "0":
                        plain_chars += char + char
                        out_pattern += "01"
                    else:
                        plain_chars = self._handle_space(plain_chars) + char + char
                        out_pattern += "1"

                    # Check what follows Shadda
                    if i + 2 < len(chars):
                        if chars[i + 2] in self.harakat:
                            # Check Muqayyad for Shadda+Harakah at end?
                            # Example: "Radd" (R + Shadda).
                            # If "Raddu" -> R(0) R(1).
                            # If Muqayyad "Radd" -> R(0) R(0).
                            is_last_shadda_group = (i + 3 >= len(chars))
                            if muqayyad and is_last_shadda_group:
                                # We already added '01' or '1'. The '1' corresponds to the second letter being Mover.
                                # If Muqayyad, the second letter should be Sakin.
                                # So '01' -> '00'. '1' -> '0'.
                                # We need to fix the last digit added.
                                out_pattern = out_pattern[:-1] + "0"
                                # Skip the harakah
                                i += 1
                            else:
                                i += 1  # Skip harakat processing next loop
                        elif chars[i + 2] in self.tnween_chars:
                            i += 1
                            plain_chars += "ن"
                            out_pattern += "0"

                            # Skip trailing Alif (Shadda + Tanween Fath)
                            if i + 2 < len(chars) and chars[i + 2] == "ا":
                                i += 1

                elif next_char in [ALEF, ALEF_MAKSURA]:
                    # Letter + long Alif: moving letter then quiescent vowel.
                    out_pattern += "10"
                    plain_chars += char + next_char

                elif next_char in self.all_chars:
                    # Letter followed by Letter (implies first is Sakin if no haraka in between?)
                    # Or assumes implicit sukun?
                    if prev_digit != "0":
                        out_pattern += "0"
                        plain_chars += char
                    elif prev_digit == "0" and i + 1 < len(chars) and chars[i + 1] == " ":
                        # Special case from Bohour
                        out_pattern += "1"
                        plain_chars += char
                    else:
                        plain_chars = self._handle_space(plain_chars) + char
                        out_pattern += "0"
                    i -= 1  # Backtrack? This logic in Bohour is tricky.
                    # If we assumed it was a letter but it's followed by a letter, we treat current as sakin.
                    # The i -= 1 might be to re-process? No, i += 2 at end.

                # Ha' al-Gha'ib (He) handling
                # Only saturate if previous letter was Mutaharrik (prev_digit != "0")
                # And NOT muqayyad (if muqayyad, we don't saturate)
                if not muqayyad and next_next_char == " " and prev_digit != "0":
                    if char == "ه":
                        if next_char == self.harakat[0]:  # Kasra
                            plain_chars += "ي"
                            out_pattern += "0"
                        if next_char == self.harakat[2]:  # Damma
                            plain_chars += "و"
                            out_pattern += "0"

                i += 2  # Advance past char and its diacritic/follower
            elif char == "ا":
                # Alef encountered as 'char' (e.g. after a diacritic consumed the previous letter)
                # NOTE(review): likely unreachable if ALEF is included in
                # LETTERS (then char in self.all_chars above matches first) —
                # confirm against pyarud's LETTERS constant.
                out_pattern += "0"
                plain_chars += char
                i += 1
            else:
                i += 1

        # Finalize
        # If Muqayyad, we don't saturate.
        # If Not Muqayyad, we saturate.

        if not muqayyad and saturate and out_pattern and out_pattern[-1] != "0":
            out_pattern += "0"  # Always end with sukun (Qafiyah)

        # Ashba' (Saturation) of last letter
        # Only if not muqayyad
        if not muqayyad and saturate and chars:
            last_char = chars[-1]
            if last_char == self.harakat[0]:  # Kasra
                plain_chars += "ي"
            elif last_char == self.tnween_chars[1]:  # Kasr Tanween
                plain_chars = plain_chars[:-1] + "ي"
            elif last_char == self.harakat[1]:  # Fatha
                plain_chars += "ا"
            elif last_char == self.harakat[2]:  # Damma
                plain_chars += "و"
            elif last_char == self.tnween_chars[0]:  # Damm Tanween
                plain_chars = plain_chars[:-1] + "و"
            elif last_char in self.mostly_saken and len(chars) > 1 and chars[-2] not in self.tnween_chars:
                plain_chars += last_char

        return plain_chars, out_pattern

    def prepare_text(self, text, saturate=True, muqayyad=False):
        """
        Converts standard Arabic text into Arudi style and extracts the binary pattern.

        Args:
            text (str): Vocalized Arabic text (one hemistich or line).
            saturate (bool): Saturate the final letter (Ishba'); see
                :meth:`_extract_pattern`.
            muqayyad (bool): Restricted rhyme; suppress saturation.

        Returns:
            tuple[str, str]: (arudi_style text, binary pattern). Empty
            strings when the input is blank.
        """
        text = text.strip()
        if not text:
            return "", ""

        # print(f"Original: {text}")
        text = self._normalize_orthography(text)
        # print(f"Norm Ortho: {text}")
        text = self._normalize_ligatures(text)
        text = self._normalize_shadda(text)
        preprocessed = self._process_specials_before(text)
        # print(f"Specials Before: {preprocessed}")
        preprocessed = self._resolve_wasl(preprocessed)
        # print(f"Resolve Wasl: {preprocessed}")
        arudi_style, pattern = self._extract_pattern(preprocessed, saturate=saturate, muqayyad=muqayyad)
        arudi_style = self._process_specials_after(arudi_style)

        return arudi_style, pattern

prepare_text(text, saturate=True, muqayyad=False)

Converts standard Arabic text into Arudi style and extracts the binary pattern.

Source code in pyarud/arudi.py
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
def prepare_text(self, text, saturate=True, muqayyad=False):
    """
    Converts standard Arabic text into Arudi style and extracts the binary pattern.
    """
    stripped = text.strip()
    if not stripped:
        return "", ""

    # Run the normalization pipeline in its fixed, required order.
    normalized = stripped
    for step in (
        self._normalize_orthography,
        self._normalize_ligatures,
        self._normalize_shadda,
        self._process_specials_before,
        self._resolve_wasl,
    ):
        normalized = step(normalized)

    arudi_style, pattern = self._extract_pattern(normalized, saturate=saturate, muqayyad=muqayyad)
    return self._process_specials_after(arudi_style), pattern