| | import pandas as pd |
| | import networkx as nx |
| | import spacy |
| | import pickle |
| | from datetime import datetime |
| | import os |
| |
|
| | |
# Load the small English spaCy pipeline once at import time; it is used as a
# module-level global by KnowledgeGraphBuilder.extract_entities.
nlp = spacy.load("en_core_web_sm")
| |
|
class KnowledgeGraphBuilder:
    """Build a directed entity co-occurrence graph from labelled texts.

    Nodes are the entity strings returned by the module-level spaCy pipeline
    ``nlp``. Each node stores its entity ``type`` plus ``real_count`` and
    ``fake_count``: how many times it was seen in texts flagged real or not.
    Edges link entities extracted from the same text and carry a ``weight``
    that counts how often the pair was seen.
    """

    def __init__(self, model_dir="models"):
        """Create a builder with an empty graph.

        Args:
            model_dir: Directory used for the default (timestamped) save path.
        """
        self.model_dir = model_dir
        self.knowledge_graph = nx.DiGraph()

    def extract_entities(self, text):
        """Extract named entities from text using spaCy.

        Args:
            text: Text to analyse. ``None``/NaN and blank values yield no
                entities; any other value is converted with ``str``.

        Returns:
            A list of ``(entity_text, entity_label)`` tuples. An empty list is
            returned when there is nothing to analyse or processing fails.
        """
        try:
            # Check None first so pd.isna is only consulted for real values.
            if text is None or pd.isna(text):
                return []

            # str() covers numeric cells (e.g. floats/ints read from a CSV).
            text = str(text).strip()
            if not text:
                return []

            return [(ent.text, ent.label_) for ent in nlp(text).ents]
        except Exception as e:
            # Deliberately best-effort: one bad input must not abort a batch.
            print(f"Error processing text: {text}")
            print(f"Error message: {str(e)}")
            return []

    def update_knowledge_graph(self, text, is_real):
        """Update knowledge graph with entities and their relationships.

        Args:
            text: Text whose entities are added to the graph.
            is_real: Truthy when the text is labelled real; selects whether
                ``real_count`` or ``fake_count`` is incremented on each node.

        Returns:
            None. Errors are printed and swallowed.
        """
        try:
            entities = self.extract_entities(text)
            if not entities:
                return

            graph = self.knowledge_graph
            count_key = 'real_count' if is_real else 'fake_count'

            # Every mention counts, so an entity repeated within one text
            # increments its counter once per mention.
            for entity, entity_type in entities:
                if graph.has_node(entity):
                    graph.nodes[entity][count_key] += 1
                else:
                    graph.add_node(
                        entity,
                        type=entity_type,
                        real_count=1 if is_real else 0,
                        fake_count=0 if is_real else 1,
                    )

            # Link each entity to every entity extracted after it; the edge
            # points from the earlier mention to the later one.
            # NOTE(review): 'is_real' is stored only when the edge is first
            # created and is never revised by later texts; a repeated entity
            # also produces a self-loop. Confirm both are intended.
            for i, (entity1, _) in enumerate(entities):
                for entity2, _ in entities[i + 1:]:
                    if graph.has_edge(entity1, entity2):
                        graph[entity1][entity2]['weight'] += 1
                    else:
                        graph.add_edge(
                            entity1,
                            entity2,
                            weight=1,
                            is_real=is_real,
                        )
        except Exception as e:
            print(f"Error updating knowledge graph: {str(e)}")

    def save_knowledge_graph(self, filename=None):
        """Save the knowledge graph to a pickle file.

        The file holds a dict with ``'nodes'`` (node -> attribute dict) and
        ``'edges'`` (source -> {target -> attribute dict}).

        Args:
            filename: Destination path. Defaults to a timestamped
                ``knowledge_graph_<YYYYmmdd_HHMMSS>.pkl`` inside ``model_dir``.

        Returns:
            The path written, or ``None`` if writing the file failed.
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = os.path.join(self.model_dir, f"knowledge_graph_{timestamp}.pkl")

        # Create the directory that will actually receive the file. Creating
        # model_dir unconditionally left an explicit filename's parent missing
        # (so the save failed) and raised for an empty model_dir.
        target_dir = os.path.dirname(filename)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        # Plain dicts keep the pickle independent of networkx view objects.
        graph_data = {
            'nodes': dict(self.knowledge_graph.nodes(data=True)),
            'edges': {},
        }
        for u, v, data in self.knowledge_graph.edges(data=True):
            graph_data['edges'].setdefault(u, {})[v] = data

        try:
            with open(filename, 'wb') as f:
                pickle.dump(graph_data, f)
            print(f"Knowledge graph saved to {filename}")
            print(f"Total nodes: {len(graph_data['nodes'])}")
            print(f"Total edges: {sum(len(edges) for edges in graph_data['edges'].values())}")
            return filename
        except Exception as e:
            print(f"Error saving knowledge graph: {str(e)}")
            return None

    def get_graph_statistics(self):
        """Get basic statistics about the knowledge graph.

        Returns:
            A dict with ``total_nodes``, ``total_edges``, ``entity_types``
            (entity type -> node count) and ``reliability_scores``
            (node -> ``real_count / (real_count + fake_count)``; nodes whose
            counts sum to zero are omitted).
        """
        stats = {
            'total_nodes': self.knowledge_graph.number_of_nodes(),
            'total_edges': self.knowledge_graph.number_of_edges(),
            'entity_types': {},
            'reliability_scores': {},
        }
        type_counts = stats['entity_types']
        scores = stats['reliability_scores']

        for node, attrs in self.knowledge_graph.nodes(data=True):
            entity_type = attrs.get('type', 'UNKNOWN')
            type_counts[entity_type] = type_counts.get(entity_type, 0) + 1

            real_count = attrs.get('real_count', 0)
            total = real_count + attrs.get('fake_count', 0)
            # Skip nodes without observations to avoid dividing by zero.
            if total > 0:
                scores[node] = real_count / total

        return stats
| |
|
def main():
    """Build the knowledge graph from ``./combined.csv``, save it, and print a summary.

    Each CSV row supplies a ``text`` column and a ``label`` column; a row
    counts as real when its label equals ``'REAL'``.
    """
    kg_builder = KnowledgeGraphBuilder()
    frame = pd.read_csv('./combined.csv')

    print("Building knowledge graph...")
    total_rows = len(frame)
    for idx, row in frame.iterrows():
        try:
            body = row['text']
            labelled_real = row['label'] == 'REAL'
            kg_builder.update_knowledge_graph(body, labelled_real)

            # Report progress every 100 rows.
            done = idx + 1
            if done % 100 == 0:
                print(f"Processed {done}/{total_rows} entries ({done/total_rows*100:.1f}%)...")
        except Exception as e:
            # A failing row is reported and skipped so the run can finish.
            print(f"Error processing row {idx}: {str(e)}")

    # Persist the graph to the builder's default location.
    kg_builder.save_knowledge_graph()

    summary = kg_builder.get_graph_statistics()
    print("\nKnowledge Graph Statistics:")
    print(f"Total nodes: {summary['total_nodes']}")
    print(f"Total edges: {summary['total_edges']}")
    print("\nEntity types distribution:")
    type_counts = summary['entity_types']
    for entity_type in type_counts:
        print(f"{entity_type}: {type_counts[entity_type]}")
| |
|
# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()