Check for duplicated questions
# Check for duplicated questions
import re
import json
import string
# Characters treated as separators when normalizing question text: ASCII
# punctuation plus the typographic apostrophe and the space character.
punctuation = string.punctuation + '’ '

# Compiled once; matches any run of one or more separator characters.
_SEPARATOR_RUN = re.compile('[%s]+' % re.escape(punctuation))


def get_text(q):
    """Return a normalized form of the question text ``q`` for comparison.

    Every run of punctuation and/or space characters is replaced by a single
    space, the result is lower-cased, and leading/trailing whitespace is
    stripped. Other whitespace inside the text (tabs, newlines) is left as is.
    """
    return _SEPARATOR_RUN.sub(' ', q).lower().strip()
# Load the question bank. The encoding is given explicitly so decoding does
# not depend on the platform's default locale.
with open('../data/ml_questions_tmp.json', encoding='utf-8') as f:
    data = json.load(f)

# Parallel lists: ids[k] is the id of the question whose normalized text is
# questions[k].
ids = []
questions = []
# The file is walked as source -> sub-source -> list of question dicts.
for source in data.values():
    for sub_source in source.values():
        for q in sub_source:
            # Each body element is either a plain string or a sequence of
            # strings; sequences are flattened with single spaces.
            q_text = ''.join(
                element if isinstance(element, str) else ' '.join(element)
                for element in q['body']
            )
            q_text = get_text(q_text)
            ids.append(q['id'])
            questions.append(q_text)
# Collect every pair of question ids whose normalized texts are identical.
# Each unordered pair is reported once, earlier question first.
duplicates = [
    (ids[first], ids[second])
    for first, text in enumerate(questions)
    for second in range(first + 1, len(questions))
    if text == questions[second]
]
# Bare expression so the notebook cell displays the result.
duplicates
[('google_ml_crash_regularization_simplicity_1',
'google_ml_crash_regularization_sparsity_1')]