Spaces:
Runtime error
Runtime error
"""Build a synthetic spelling-correction dataset.

For every (misspelled word, correct word) pair in ``data/asotest.csv``, find
one sentence in ``data/data.txt`` that contains the correct word and is not
already present in the ``summary`` column of ``train.csv``. The correct word
in that sentence is swapped for the misspelled one to produce the noisy
``text``; the processed original sentence becomes the target ``summary``.
The result is written to ``asosoft_spell.csv``.
"""
import re

import pandas as pd
from tqdm import tqdm

# NOTE(review): `process_text` is not defined in this file, so it is presumably
# provided by this wildcard import -- confirm, then import it by name.
from ckb_helpers import *

# In this file `text` holds the misspelled word and `summary` the correct one.
df = pd.read_csv('data/asotest.csv')
# NOTE(review): read_csv splits on commas; a sentence containing "," would be
# spread over several fields -- confirm data.txt never contains ASCII commas.
data_df = pd.read_csv('data/data.txt', names=['text'])
train_df = pd.read_csv('train.csv')

# The "not already in the training data" filter does not depend on the word
# being looked up, so apply it once instead of on every iteration.
# NOTE(review): this compares raw sentences against train_df.summary, while the
# summaries written below go through process_text -- confirm they line up.
data_df = data_df[~data_df['text'].isin(train_df['summary'])]

data = []
pbar = tqdm(df.itertuples(), total=len(df))
for row in pbar:
    incorrect_word = row.text
    correct_word = row.summary

    # Empty CSV cells are read as NaN (a float). Skip such rows, and skip an
    # empty correct word, which would otherwise match every sentence.
    if not isinstance(incorrect_word, str) or not isinstance(correct_word, str):
        continue
    if not correct_word:
        continue

    # Escape the word so regex metacharacters in it are matched literally.
    # The same case-insensitive pattern is used for the lookup and for the
    # substitution, so every selected sentence really gets the misspelling.
    escaped_word = re.escape(correct_word)
    contains_word = data_df['text'].str.contains(escaped_word, case=False, na=False)
    sentences = data_df[contains_word]
    pbar.set_description(f"Rows found after cross checking train data: {len(sentences)} for {correct_word}")

    if sentences.empty:
        continue

    # Only the first matching sentence is used for each word pair.
    sentence_index = sentences.index[0]
    sentence = sentences['text'].iloc[0]

    # A callable replacement keeps backslashes in incorrect_word literal.
    new_sentence = re.sub(
        escaped_word,
        lambda _match: incorrect_word,
        sentence,
        flags=re.IGNORECASE,
    )
    data.append({"text": new_sentence, "summary": process_text(sentence)})

    # Drop that row so the final dataset doesn't include the same sentence
    # for two different incorrect words.
    data_df = data_df.drop(index=sentence_index)

# Use a separate name so the input frame `df` is not shadowed by the output.
spell_df = pd.DataFrame(data)
spell_df.to_csv('asosoft_spell.csv', index=False)