import logging

import yaml
from pathlib import Path
from typing import Dict

from app.services.base import load_spacy_model
from app.core.config import settings, APP_NAME, SPACY_MODEL_ID
from app.core.exceptions import ServiceError

logger = logging.getLogger(f"{APP_NAME}.services.inclusive_language")


class InclusiveLanguageChecker:
    def __init__(self, rules_directory: str = settings.INCLUSIVE_RULES_DIR):
        self._nlp = None
        self.matcher = None
        self.rules = self._load_inclusive_rules(Path(rules_directory))

    def _load_inclusive_rules(self, rules_path: Path) -> Dict[str, Dict]:
        """
        Load YAML-based inclusive language rules from the given directory.
        """
        if not rules_path.is_dir():
            logger.error(f"Inclusive language rules directory not found: {rules_path}")
            raise ServiceError(
                status_code=500,
                detail=f"Inclusive language rules directory not found: {rules_path}"
            )
        rules = {}
        for yaml_file in rules_path.glob("*.yml"):
            try:
                with yaml_file.open(encoding="utf-8") as f:
                    rule_list = yaml.safe_load(f)
                if not isinstance(rule_list, list):
                    logger.warning(f"Skipping non-list rule file: {yaml_file}")
                    continue
                for rule in rule_list:
                    inconsiderate = rule.get("inconsiderate", [])
                    considerate = rule.get("considerate", [])
                    note = rule.get("note", "")
                    source = rule.get("source", "")
                    rule_type = rule.get("type", "basic")
                    # Ensure consistent formatting: normalise scalar entries to lists
                    if isinstance(considerate, str):
                        considerate = [considerate]
                    if isinstance(inconsiderate, str):
                        inconsiderate = [inconsiderate]
                    for phrase in inconsiderate:
                        rules[phrase.lower()] = {
                            "considerate": considerate,
                            "note": note,
                            "source": source,
                            "type": rule_type
                        }
            except Exception as e:
                logger.error(f"Error loading rule file {yaml_file}: {e}", exc_info=True)
                raise ServiceError(
                    status_code=500,
                    detail=f"Failed to load inclusive language rules: {e}"
                )
        logger.info(f"Loaded {len(rules)} inclusive language rules from {rules_path}")
        return rules
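
    # A sketch of the rule-file shape this loader expects: a YAML list of
    # mappings, one per rule. The schema is inferred from the .get() calls
    # above rather than from any published spec, and the values below are
    # illustrative only. Note that only files matching *.yml are picked up
    # by the glob, so a .yaml extension would be silently ignored.
    #
    #   - type: basic
    #     inconsiderate: chairman        # a string or a list of strings
    #     considerate:
    #       - chairperson
    #       - chair
    #     note: Prefer gender-neutral titles for roles.
    #     source: https://example.org/style-guide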

    def _get_nlp(self):
        """
        Lazy-loads the spaCy model for NLP processing.
        """
        if self._nlp is None:
            self._nlp = load_spacy_model(SPACY_MODEL_ID)
        return self._nlp

    def _init_matcher(self, nlp):
        """
        Initializes spaCy PhraseMatcher using loaded rules.
        """
        from spacy.matcher import PhraseMatcher

        matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
        for phrase in self.rules:
            matcher.add(phrase, [nlp.make_doc(phrase)])
        logger.info(f"PhraseMatcher initialized with {len(self.rules)} phrases.")
        return matcher
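
    # Note on matching semantics: attr="LOWER" makes the PhraseMatcher compare
    # the lowercased text of each token, so a rule phrase such as "chairman"
    # also matches "Chairman" or "CHAIRMAN" in the input, and multi-word
    # phrases are matched token by token on the same attribute.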

    async def check(self, text: str) -> dict:
        """
        Checks a string for non-inclusive language based on rule definitions.
        """
        text = text.strip()
        if not text:
            raise ServiceError(status_code=400, detail="Input text is empty for inclusive language check.")
        try:
            nlp = self._get_nlp()
            if self.matcher is None:
                self.matcher = self._init_matcher(nlp)
            doc = nlp(text)
            matches = self.matcher(doc)
            results = []
            matched_spans = set()
            # Match exact phrases first, preferring longer matches at the same start
            for match_id, start, end in sorted(matches, key=lambda m: (m[1], -(m[2] - m[1]))):
                phrase = nlp.vocab.strings[match_id].lower()
                if any(start < e and s < end for s, e in matched_spans):
                    continue  # Skip matches overlapping an already accepted span
                matched_spans.add((start, end))
                rule = self.rules.get(phrase)
                if rule:
                    results.append({
                        "term": doc[start:end].text,
                        "type": rule["type"],
                        "note": rule["note"],
                        "suggestions": rule["considerate"],
                        "context": doc[start:end].sent.text,
                        "start_char": doc[start].idx,
                        "end_char": doc[end - 1].idx + len(doc[end - 1]),
                        "source": rule["source"]
                    })
            # Match individual token lemmas (fallback)
            for token in doc:
                lemma = token.lemma_.lower()
                if any(s <= token.i < e for s, e in matched_spans):
                    continue  # Token already covered by a phrase match above
                if lemma in self.rules:
                    rule = self.rules[lemma]
                    results.append({
                        "term": token.text,
                        "type": rule["type"],
                        "note": rule["note"],
                        "suggestions": rule["considerate"],
                        "context": token.sent.text,
                        "start_char": token.idx,
                        "end_char": token.idx + len(token),
                        "source": rule["source"]
                    })
| return {"issues": results} | |
| except Exception as e: | |
| logger.error(f"Inclusive language check error for text: '{text[:50]}...'", exc_info=True) | |
| raise ServiceError( | |
| status_code=500, | |
| detail="An internal error occurred during inclusive language checking." | |
| ) from e | |
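

# Minimal usage sketch, not part of the service itself: assumes the app
# package is importable, settings.INCLUSIVE_RULES_DIR points at a directory
# of *.yml rule files, and the spaCy model named by SPACY_MODEL_ID is
# installed. The sample sentence and printed fields are illustrative only.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        checker = InclusiveLanguageChecker()
        report = await checker.check("The chairman opened the meeting.")
        for issue in report["issues"]:
            print(f"{issue['term']!r} -> {issue['suggestions']} ({issue['note']})")

    asyncio.run(_demo())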