| """Teacher Agent using Upper Confidence Bound (UCB) bandit algorithm.""" | |
| import numpy as np | |
| from typing import Dict, List | |
| from interfaces import TeacherAction, StudentState, TeacherAgentInterface | |
| def compute_reward( | |
| accuracy_before: float, | |
| accuracy_after: float, | |
| difficulty: str, | |
| is_review: bool | |
| ) -> float: | |
| """ | |
| Compute reward for teacher action. | |
| Reward structure: | |
| - Base: improvement in accuracy | |
| - Bonus: harder tasks encourage pushing boundaries | |
| - Bonus: successful reviews (spaced repetition) | |
| - Penalty: wasted reviews (student still remembers perfectly) | |
| """ | |
| improvement = accuracy_after - accuracy_before | |
| # Bonus for harder tasks (encourage pushing boundaries) - expanded for all 7 levels | |
| difficulty_bonus_map = { | |
| 'trivial': 0.2, | |
| 'easy': 0.5, | |
| 'medium': 1.0, | |
| 'hard': 2.0, | |
| 'expert': 3.0, | |
| 'master': 4.0, | |
| 'grandmaster': 5.0 | |
| } | |
| difficulty_bonus = difficulty_bonus_map.get(difficulty, 1.0) | |
| # Bonus for successful reviews (spaced repetition) | |
| review_bonus = 1.0 if (is_review and improvement > 0) else 0.0 | |
| # Penalty for wasted reviews (student still remembers perfectly) | |
| review_penalty = -0.5 if (is_review and accuracy_after > 0.9) else 0.0 | |
| return improvement + difficulty_bonus + review_bonus + review_penalty | |
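
# Illustrative numbers (not from the original source): a 'hard' review task that lifts
# accuracy from 0.4 to 0.6 scores improvement (0.2) + difficulty bonus (2.0) +
# review bonus (1.0) = 3.2, while re-reviewing material the student already answers
# above 0.9 accuracy takes the -0.5 penalty.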


class TeacherAgent(TeacherAgentInterface):
    """
    Teacher Agent using UCB (Upper Confidence Bound) bandit algorithm.

    Action space: dynamically determined from the task generator
    - Topics: from MockTaskGenerator (15 topics)
    - Difficulties: from MockTaskGenerator (7 difficulties: trivial→grandmaster)
    - Options: 2 (new vs review)

    UCB formula:
        UCB(a) = estimated_reward(a) + exploration_bonus × sqrt(log(total_pulls) / pulls(a))

    Balances exploration (trying new actions) vs exploitation (using known-good actions).
    """

    def __init__(self, exploration_bonus: float = 2.0, task_generator=None):
        """
        Initialize teacher agent with dynamic action space.

        Args:
            exploration_bonus: Controls exploration vs exploitation balance.
                Higher = more exploration (try new actions).
                Lower = more exploitation (use known-good actions).
            task_generator: Optional MockTaskGenerator to get topics/difficulties.
                If None, uses the default expanded set.
        """
        self.exploration_bonus = exploration_bonus

        # Define action space dynamically
        if task_generator:
            self.topics = task_generator.get_available_topics()
            self.difficulties = task_generator.get_available_difficulties()
        else:
            # Default expanded set
            self.topics = [
                'history', 'science', 'literature', 'geography', 'current_events',
                'mathematics', 'programming', 'philosophy', 'art', 'music',
                'biology', 'chemistry', 'physics', 'economics', 'psychology'
            ]
            self.difficulties = ['trivial', 'easy', 'medium', 'hard', 'expert', 'master', 'grandmaster']
        self.review_options = [False, True]  # False = new, True = review

        # Create all action combinations
        self.actions = [
            (topic, diff, review)
            for topic in self.topics
            for diff in self.difficulties
            for review in self.review_options
        ]
        self.num_actions = len(self.actions)  # 15 topics × 7 difficulties × 2 = 210 actions by default

        # Track statistics per action
        self.action_counts = np.zeros(self.num_actions, dtype=np.float64)
        self.action_rewards = np.zeros(self.num_actions, dtype=np.float64)
        self.total_pulls = 0

    def select_action(self, student_state: StudentState) -> TeacherAction:
        """
        Select next action using UCB algorithm.

        For each action:
        - If never tried: select it (cold start)
        - Otherwise: compute UCB score and select the highest
        """
        # Cold start: try each action at least once
        untried_actions = [i for i in range(self.num_actions) if self.action_counts[i] == 0]

        if untried_actions:
            action_idx = self.total_pulls % len(untried_actions)
            selected_idx = untried_actions[action_idx]
        else:
            # All actions tried - use UCB
            ucb_scores = self._compute_ucb_scores()
            selected_idx = np.argmax(ucb_scores)

        return self._index_to_action(selected_idx)

    def _compute_ucb_scores(self) -> np.ndarray:
        """Compute UCB score for each action."""
        scores = np.zeros(self.num_actions)

        for i in range(self.num_actions):
            if self.action_counts[i] == 0:
                # Never tried - give high score for exploration
                scores[i] = float('inf')
            else:
                # Estimated reward (average so far)
                estimated_reward = self.action_rewards[i] / self.action_counts[i]

                # Exploration bonus: sqrt(log(total_pulls) / pulls(action))
                exploration_term = np.sqrt(
                    np.log(max(1, self.total_pulls)) / self.action_counts[i]
                )

                # UCB score = estimated reward + scaled exploration bonus
                scores[i] = estimated_reward + self.exploration_bonus * exploration_term

        return scores
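
    # Illustrative numbers (not from the original source): with exploration_bonus = 2.0,
    # an action pulled 4 times averaging reward 2.0 after 100 total pulls scores
    #   2.0 + 2.0 * sqrt(ln(100) / 4) ≈ 4.15,
    # while one pulled 50 times with the same average scores only ≈ 2.61, so
    # rarely-tried actions keep being revisited until their estimates firm up.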

    def update(self, action: TeacherAction, reward: float):
        """
        Update teacher policy based on reward.

        Stores the cumulative reward per action; the per-action average that UCB
        uses is recomputed on demand in _compute_ucb_scores (total / count).
        """
        action_idx = self._action_to_index(action)

        # Update statistics: one more pull, reward added to the running total
        self.action_counts[action_idx] += 1
        self.action_rewards[action_idx] += reward
        self.total_pulls += 1
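
    # Equivalent view (illustrative, not in the original): maintaining the mean directly
    # via the standard incremental update
    #   new_avg = old_avg + (reward - old_avg) / count
    # would yield the same estimates; storing the raw total just keeps update() trivial.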

    def _action_to_index(self, action: TeacherAction) -> int:
        """Convert TeacherAction to integer index."""
        try:
            topic_idx = self.topics.index(action.topic)
            diff_idx = self.difficulties.index(action.difficulty)
            review_idx = int(action.is_review)

            # Encode: topic * (diffs * reviews) + diff * reviews + review
            index = (
                topic_idx * (len(self.difficulties) * len(self.review_options)) +
                diff_idx * len(self.review_options) +
                review_idx
            )
            return index
        except (ValueError, AttributeError):
            raise ValueError(f"Invalid action: {action}")

    def _index_to_action(self, index: int) -> TeacherAction:
        """Convert integer index to TeacherAction."""
        if not (0 <= index < self.num_actions):
            raise ValueError(f"Invalid action index: {index}")

        # Decode: index -> (topic, difficulty, review)
        review_idx = index % len(self.review_options)
        diff_idx = (index // len(self.review_options)) % len(self.difficulties)
        topic_idx = index // (len(self.difficulties) * len(self.review_options))

        return TeacherAction(
            topic=self.topics[topic_idx],
            difficulty=self.difficulties[diff_idx],
            is_review=bool(review_idx)
        )
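
    # Worked round trip over the default action space (illustrative): ('science', 'hard',
    # review=True) encodes as 1 * (7 * 2) + 3 * 2 + 1 = 21, and decoding 21 recovers the
    # same triple, so the two helpers are exact inverses over 0..num_actions - 1.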

    def get_statistics(self) -> Dict:
        """Get teacher statistics for visualization."""
        return {
            'action_counts': self.action_counts.copy(),
            'action_rewards': self.action_rewards.copy(),
            'actions': self.actions.copy(),
            'topics': self.topics.copy(),
            'difficulties': self.difficulties.copy(),
            'review_options': self.review_options.copy(),
            'total_pulls': self.total_pulls
        }
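

# Minimal smoke-test loop: an illustrative sketch, not part of the original module.
# It assumes select_action can be called with a placeholder student state (this
# implementation never inspects it) and fabricates accuracies to drive compute_reward.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    teacher = TeacherAgent(exploration_bonus=2.0)

    for _ in range(500):
        action = teacher.select_action(student_state=None)  # placeholder state
        # Pretend the student improves a little on every task
        before = float(rng.uniform(0.2, 0.8))
        after = min(1.0, before + float(rng.uniform(0.0, 0.2)))
        reward = compute_reward(before, after, action.difficulty, action.is_review)
        teacher.update(action, reward)

    stats = teacher.get_statistics()
    print(f"Total pulls: {stats['total_pulls']}")
    avg_rewards = stats['action_rewards'] / np.maximum(stats['action_counts'], 1)
    best_idx = int(np.argmax(avg_rewards))
    print(f"Highest-average action so far: {stats['actions'][best_idx]}")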