# Qwiklabs Quizzes to JSON

from bs4 import BeautifulSoup
import datetime
import requests
import json
import string
import re

# Raw Data Acquisition

def get_quizzes_data():
    """Download the quizzes of every catalogued course and checkpoint them as JSON.

    Reads the course catalogue from ``../data/gcp_courses.json``, requests the
    page of each lesson whose ``type`` is ``'quiz'``, decodes the JSON stored in
    the ``quizversion`` attribute of the page's ``<ql-quiz>`` element, and
    writes the collected questions to
    ``../data/qwiklabs_quizzes_chkp_<yymmdd>.json``.

    Raises:
        requests.HTTPError: if a quiz page answers with an error status.
        requests.Timeout: if a quiz page does not answer in time.
        RuntimeError: if a fetched page carries no ``<ql-quiz quizversion=...>``
            element to read the questions from.
    """
    # Load info data
    with open('../data/gcp_courses.json', encoding='utf-8') as f:
        courses_data = json.load(f)

    url = 'https://www.cloudskillsboost.google/course_sessions'
    # NOTE(review): the cookie value is blank here; presumably a valid session
    # value has to be pasted in before running -- confirm.
    cookie = {'_cvl-4_1_14_session': ''}
    quizzes_data = []

    for course_data in courses_data:
        quizzes = []

        for module_data in course_data['modules']:
            questions = []

            for lesson_data in module_data['lessons']:
                if lesson_data['type'] != 'quiz':
                    continue

                quiz_url = f"{url}/{lesson_data['id']}"
                # A timeout keeps one stalled request from hanging the crawl,
                # and the status check stops error pages from being parsed.
                response = requests.get(quiz_url, cookies=cookie, timeout=30)
                response.raise_for_status()

                soup = BeautifulSoup(response.text, 'html.parser')
                quiz_tag = soup.find('ql-quiz')
                if quiz_tag is None or not quiz_tag.has_attr('quizversion'):
                    raise RuntimeError(
                        f'No <ql-quiz quizversion=...> element found at {quiz_url}')

                quiz_data = json.loads(quiz_tag['quizversion'])
                # NOTE(review): this assignment replaces (does not extend) the
                # list, so a module with several quiz lessons keeps only the
                # last one -- confirm this is intended.
                questions = quiz_data['quizItems']

            if questions:
                quizzes.append({
                    'name': module_data['name'],
                    'questions': questions
                })

        if quizzes:
            quizzes_data.append({
                'name': course_data['name'],
                'quizzes': quizzes
            })

    # Save data
    timestamp = datetime.datetime.now().strftime('%y%m%d')
    file_name = f'qwiklabs_quizzes_chkp_{timestamp}.json'

    with open(f'../data/{file_name}', 'w', encoding='utf-8') as f:
        json.dump(quizzes_data, f, indent=2)

# get_quizzes_data()

# Load quizzes data

# Raw quiz checkpoint to process; the file name follows the pattern that
# get_quizzes_data() writes (qwiklabs_quizzes_chkp_<yymmdd>.json).
file_path = '../data/qwiklabs_quizzes_chkp_230525.json'

with open(file_path) as checkpoint_file:
    quizzes_data = json.loads(checkpoint_file.read())

# Preprocessing/Cleaning definitions

# Boilerplate prefixes that preprocess_body() strips from the start of a
# question body. Only the FIRST matching entry is removed, so an entry must be
# listed before any shorter entry that is a prefix of it
# (e.g. 'Fill in the blank: <br>' before 'Fill in the blank:').
remove_body_start = [
    'Choose the correct three answers.',
    'Select the correct word below to fill in the blank:',
    'Fill in the blank: <br>',
    'Fill in the blanks: <br>',
    'Fill in the blank:',
    'Fill in the blanks.',
    'Fill in the blank.',
    'Fill in the blanks:'
]

# Boilerplate suffixes that preprocess_body() strips from the end of a
# question body; as above, only the first matching entry is removed.
remove_body_end = [
    '(Choose 2 correct Answers)',
    '(Choose two answers)',
    'There could be more than one answer.',
    'Credit is given for selecting the correct three.',
    '(check all that apply)',
    '<br>'
]

# Canonical wording substituted by preprocess_opt() for any option whose text
# starts with 'All o'; also used by review_question() when it adds an option.
all_opts = 'All of the options.'

# Whole-explanation substitutions: preprocess_exp() looks the complete
# rationale text up here (exact match) before any other cleaning.
replace_exp = {
    'This answer is partially correct, please review the module again.': 'Partially correct.',
    'This is one of the correct answers.': 'Partially correct.'
}   

# Verdict phrases that preprocess_exp() strips from the start of an
# explanation. Only the FIRST matching entry is removed, so longer phrases are
# listed before the shorter phrases they begin with
# (e.g. 'Correct!' and 'Correct.' before 'Correct').
remove_exp_start = [
    'Correct Answer',
    'Correct!',
    'Correct.',
    'Correct',
    'Incorrect, please review the module',
    'That answer is correct!',
    'That answer is incorrect. Please review the lesson again.',
    'This answer is correct! All three correct answers must be selected to receive credit for the question.',
    'This answer is correct!',
    'This answer is correct..',
    'This answer is correct.',
    'This answer is correct',
    'This answer is incorrect, please review the module again.',
    'This answer is incorrect, please review the module again',
    'This answer is incorrect.',
    'This answer is not correct, please review the module again.',
    'This answer is not correct. Please review the module.',
    'This answer is not correct. Review the module.',
    'This answer is not correct.',
    'This answer is not correct;',
    'This is correct.',
    'This is incorrect, please review the module again.',
    'This is incorrect, please review the module again',
    'This is one of the correct answers.',
    'This option is correct',
    'This option is incorrect'
]

# Suffix substitutions: when an explanation ends with a key, preprocess_exp()
# swaps that ending for the mapped value.
replace_exp_end = {
    ',” and try again.': '”.'
}

# Substring substitutions that preprocess_exp() applies to every occurrence
# anywhere in an explanation.
contains_exp = {
    'Please review': 'Review'
}

def remove_extra_spaces(text):
    """Collapse every run of whitespace in *text* into a single space.

    Newlines and tabs count as whitespace. Leading and trailing runs are
    collapsed to one space as well; they are not stripped.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', text)

def preprocess_body(text):
    """Return a question body without its boilerplate prefix and suffix.

    At most one entry of ``remove_body_start`` is cut from the front and at
    most one entry of ``remove_body_end`` from the back (the first match in
    list order wins); whitespace runs are then collapsed.
    """
    prefix = next((p for p in remove_body_start if text.startswith(p)), None)
    if prefix is not None:
        text = text[len(prefix):].lstrip()

    # The suffix is looked up on the text that remains after the prefix cut.
    suffix = next((s for s in remove_body_end if text.endswith(s)), None)
    if suffix is not None:
        text = text[:-len(suffix)].rstrip()

    return remove_extra_spaces(text)

def preprocess_opt(text):
    """Normalise an answer option's text.

    Any option starting with ``'All o'`` is replaced by the canonical
    ``all_opts`` wording; every other option only has its whitespace runs
    collapsed.
    """
    if not text.startswith('All o'):
        return remove_extra_spaces(text)

    return all_opts

def preprocess_exp(exp_text):
    """Clean the rationale text attached to an answer option.

    Steps, in order: swap the whole text if it is a key of ``replace_exp``,
    drop every run of ``=`` characters, strip the first matching verdict
    phrase from ``remove_exp_start``, rewrite endings listed in
    ``replace_exp_end``, and apply the ``contains_exp`` substring
    substitutions.
    """
    text = re.sub('=+', '', replace_exp.get(exp_text, exp_text))

    # First match in list order wins; at most one phrase is stripped.
    verdict = next((s for s in remove_exp_start if text.startswith(s)), None)
    if verdict is not None:
        text = text[len(verdict):].lstrip()

    for ending, replacement in replace_exp_end.items():
        if text.endswith(ending):
            text = text[:-len(ending)] + replacement

    for old, new in contains_exp.items():
        text = text.replace(old, new)

    return text

def get_item_id(resource):
    """Turn a display name into a short lowercase identifier.

    Every run of punctuation (including the typographic apostrophe) and
    spaces becomes a single underscore, the result is lower-cased, well-known
    terms are abbreviated, and one trailing underscore is dropped.
    """
    separators = string.punctuation + '’'
    pattern = f'[{re.escape(separators)} ]+'
    item = re.sub(pattern, '_', resource).lower()

    # Order matters: longer terms must be abbreviated before the shorter
    # terms they contain (e.g. 'deep_neural_networks' before 'neural_networks').
    abbreviations = (
        ('introduction', 'intro'),
        ('computer_vision', 'cv'),
        ('convolutional_neural_networks', 'cnns'),
        ('convolutional_neural_network', 'cnn'),
        ('deep_neural_networks', 'dnns'),
        ('deep_neural_network', 'dnn'),
        ('neural_networks', 'nns'),
        ('neural_network', 'nn'),
        ('google_cloud', 'gcloud'),
        ('bigquery', 'bq'),
        ('tensorflow', 'tf'),
        ('machine_learning', 'ml'),
        ('natural_language_processing', 'nlp'),
    )
    for term, short in abbreviations:
        item = item.replace(term, short)

    return item[:-1] if item.endswith('_') else item

def review_question(q):
    """Apply the hand-curated corrections for one processed question, in place.

    The question is identified by ``q['id']``. Depending on the id, typos in
    the body, options or explanations are patched and ``feedback`` /
    ``references`` entries are set. Questions with any other id are left
    untouched. Nothing is returned.
    """
    qid = q['id']

    # Typos inside one option's text: id -> (option index, wrong, corrected).
    option_fixes = {
        'how_google_does_ml_4_3': (0, 'suiteof', 'suite of'),
        'tf_on_gcloud_1_6': (0, 'TensorFlowis', 'TensorFlow is'),
        'tf_on_gcloud_4_2': (1, 'init_.py', '__init__.py'),
        'feature_engineering_4_6': (1, 'execution.Notice', 'execution. Notice'),
        'feature_engineering_6_1': (0, 'Apache', 'Apache Beam'),
    }

    # Corrections to the question body: id -> (wrong, corrected).
    body_fixes = {
        'tf_on_gcloud_4_3': ('Verte', 'Vertex'),
        'feature_engineering_3_6': (
            'tf.feature_column.bucketized_column',
            'tf.keras.layers.Discretization'),
    }

    # Single reference link attached to a question: id -> URL.
    reference_links = {
        'how_google_does_ml_3_3':
            "https://www.bmc.com/blogs/machine-learning-architecture/",
        'tf_on_gcloud_2_7':
            'https://www.tensorflow.org/guide/keras/preprocessing_layers#the_adapt_method',
        'tf_on_gcloud_2_8':
            'https://www.tensorflow.org/guide/keras/preprocessing_layers#categorical_features_preprocessing',
        'tf_on_gcloud_4_2':
            'https://cloud.google.com/vertex-ai/docs/training/create-python-pre-built-container',
        'cv_fundamentals_with_gcloud_4_3':
            'https://youtu.be/4pcqScI1jhA?t=206',
    }

    if qid in option_fixes:
        position, wrong, corrected = option_fixes[qid]
        texts = q['options'][position]['option']
        texts[0] = texts[0].replace(wrong, corrected)

    if qid in body_fixes:
        wrong, corrected = body_fixes[qid]
        q['body'][0] = q['body'][0].replace(wrong, corrected)

    if qid == 'how_google_does_ml_3_2':
        # Same typo appears in every option of this question.
        for entry in q['options']:
            entry['option'][0] = entry['option'][0].replace(
                'orBigQuery', 'or BigQuery')

    elif qid == 'tf_on_gcloud_2_7':
        # Every original option becomes partially correct and a new
        # "all of the options" answer is appended as the correct one.
        q['options'][0]['answer'] = False
        for entry in q['options']:
            entry['explanation'] = ['Partially correct.']

        q['options'].append({
            'option': [all_opts],
            'answer': True,
            'explanation': []
        })

        q['feedback'] = [
            "The original question confused adaptable layers with trainable layers.",
            '"All of the options are correct." option was added.']

    elif qid == 'recommendation_systems_on_gcloud_2_1':
        notes = q['options'][0]['explanation']
        notes[0] = notes[0].replace('scaleable', 'scalable')

    elif qid == 'cv_fundamentals_with_gcloud_4_3':
        q['feedback'] = [
            "It doesn't matter which neuron is trained to process which input values in a dense layer.",
            'If you randomly reshuffle the order of the pixels in the images, the classification performance stays the same, because the corresponding weights are also reshuffled.',
            'However, when human beings look at an image where the pixels are randomly reshuffled, the image looks like noise.',
            'This phenomenon happens because the concept of hierarchy plays a significant role in the human brain.',
            'Information is stored in sequence of patterns, in sequential order.',
            'Similarly, you can expect the CNNs to perform poorly in contrast to DNN models if the image pixels are randomly permitted.',
            'This is because hierarchy, or how pixels are placed next to each other, is a vital part of the CNN model design.'
        ]

    elif qid == 'production_ml_systems_3_4':
        q['options'][3]['explanation'] = []

    if qid in reference_links:
        q['references'] = [reference_links[qid]]

# Data processing

# Build {course id: [question, ...]} from the raw quiz checkpoint.
processed_data = {}

for course_data in quizzes_data:
    course_id = get_item_id(course_data['name'])
    questions = []

    # Quizzes and their questions are numbered from 1 inside each course.
    for num_quiz, quiz in enumerate(course_data['quizzes'], start=1):
        for num_item, item in enumerate(quiz['questions'], start=1):
            options = []
            for opt in item['options']:
                exp = preprocess_exp(opt['rationale'])
                options.append({
                    'option': [preprocess_opt(opt['title'])],
                    'answer': opt['isAnswer'],
                    'explanation': [exp] if exp else []
                })

            q = {
                'id': f'{course_id}_{num_quiz}_{num_item}',
                'domain': [],
                'intro': [],
                'body': [preprocess_body(item['stem'])],
                'open': False,
                'options': options,
                'feedback': [],
                'references': [],
                'date': "",
            }

            # Hand-curated per-question corrections, applied in place.
            review_question(q)
            questions.append(q)

    processed_data[course_id] = questions

# Pretty-print, then join the most deeply indented lines back onto the line
# above so the printed JSON is more compact.
formatted_data = (
    json.dumps(processed_data, indent=2)
    .replace('\n            ', '')
    .replace('"\n          ', '"')
)
print(formatted_data)