# Check for duplicated questions

import re
import json
import string
# Characters treated as separators: ASCII punctuation, the typographic
# apostrophe, and the space character.
punctuation = string.punctuation + '’ '

# Compiled once at import time; matches one or more consecutive separator
# characters.
_SEPARATOR_RUN = re.compile('[%s]+' % re.escape(punctuation))


def get_text(q: str) -> str:
    """Return *q* normalized for exact-match comparison.

    Every run of characters from ``punctuation`` is replaced by a single
    space, the result is lower-cased, and leading/trailing whitespace is
    stripped.
    """
    return _SEPARATOR_RUN.sub(' ', q).lower().strip()
# Load the question collection. JSON interchange text is UTF-8, so the
# encoding is pinned instead of relying on the platform's locale default.
with open('../data/ml_questions_tmp.json', encoding='utf-8') as f:
    data = json.load(f)
# Parallel lists: ids[k] is the id of the question whose normalized text is
# questions[k].
ids = []
questions = []

# The loaded JSON is walked as source -> sub-source -> list of question dicts.
for source in data.values():
    for sub_source in source.values():
        for q in sub_source:
            # A body element that is a string is used as-is; anything else is
            # joined with single spaces (so it must be an iterable of
            # strings). The elements themselves are concatenated with no
            # separator between them.
            body_text = ''.join(
                element if isinstance(element, str) else ' '.join(element)
                for element in q['body']
            )
            normalized_text = get_text(body_text)
            ids.append(q['id'])
            questions.append(normalized_text)
# Group question positions by normalized text so that only questions sharing
# the same text are paired, instead of comparing every question with every
# other one.
positions_by_text = {}
for position, text in enumerate(questions):
    positions_by_text.setdefault(text, []).append(position)

# Sorting the (earlier, later) position pairs orders them by the earlier
# position and then by the later one.
duplicate_positions = sorted(
    (earlier, later)
    for positions in positions_by_text.values()
    for k, earlier in enumerate(positions)
    for later in positions[k + 1:]
)

# Each entry is a pair of ids whose questions have identical normalized text.
duplicates = [
    (ids[earlier], ids[later])
    for earlier, later in duplicate_positions
]

duplicates
[('google_ml_crash_regularization_simplicity_1',
  'google_ml_crash_regularization_sparsity_1')]