GCP Courses Builder

import os
import glob
import json
import re
import shutil
import string
import datetime
from bs4 import BeautifulSoup
from multiprocessing import Pool

# Locations are hard-coded: root_path is the gcp_courses folder inside root_dir,
# toc_path the _toc.yml file directly under root_dir.
root_name = 'gcp_courses'
root_dir = '/home/eavelar/dev/eavelardev.github.io'
root_path = '/'.join((root_dir, root_name))
toc_path = '/'.join((root_dir, '_toc.yml'))

# Raw course list, read relative to the current working directory.
with open('../data/gcp_courses.json') as f:
    courses_data = json.loads(f.read())

# ASCII punctuation plus the typographic apostrophe.
punctuation = string.punctuation + '’'

def get_item_id(resource, size):
    """Turn a human-readable title into a short snake_case identifier.

    Each run of punctuation and/or spaces becomes one underscore, the text is
    lowercased, a fixed list of long phrases is abbreviated, and the result is
    cut to at most ``size`` characters. A single trailing underscore, if
    present after the cut, is dropped.
    """
    separators = re.compile('[%s ]+' % re.escape(punctuation))
    slug = separators.sub('_', resource).lower()

    # Applied in this order, so longer phrases are replaced before the
    # shorter phrases they contain.
    abbreviations = (
        ('introduction', 'intro'),
        ('computer_vision', 'cv'),
        ('convolutional_neural_networks', 'cnns'),
        ('convolutional_neural_network', 'cnn'),
        ('deep_neural_networks', 'dnns'),
        ('deep_neural_network', 'dnn'),
        ('neural_networks', 'nns'),
        ('neural_network', 'nn'),
        ('google_cloud', 'gcp'),
        ('bigquery', 'bq'),
        ('tensorflow', 'tf'),
        ('machine_learning', 'ml'),
        ('natural_language_processing', 'nlp'),
    )

    for phrase, short in abbreviations:
        slug = slug.replace(phrase, short)

    slug = slug[:size]
    return slug[:-1] if slug.endswith('_') else slug

def get_course_id(course):
    """Return the identifier for a course title, limited to 51 characters."""
    return get_item_id(resource=course, size=51)

def get_module_id(module):
    """Return the identifier for a module title, limited to 28 characters."""
    return get_item_id(resource=module, size=28)

def get_lesson_id(lesson):
    """Return the identifier for a lesson title, limited to 51 characters."""
    return get_item_id(resource=lesson, size=51)

# Re-index the raw course list by generated ids:
# courses[course_id]['modules'][module_id]['lessons'][lesson_id] -> lesson dict.
courses = {}

for course_data in courses_data:
    # Shallow copy whose 'modules' list is swapped for an id-keyed dict.
    course_entry = {**course_data, 'modules': {}}
    courses[get_course_id(course_data['name'])] = course_entry

    for module_data in course_data['modules']:
        module_entry = {**module_data, 'lessons': {}}
        course_entry['modules'][get_module_id(module_data['name'])] = module_entry

        for lesson_data in module_data['lessons']:
            # Lesson dicts are stored by reference (not copied).
            module_entry['lessons'][get_lesson_id(lesson_data['name'])] = lesson_data

def get_courses_path(exist=True):
    """Yield ``<root_path>/<course>`` for each course whose directory presence equals ``exist``."""
    candidates = (f"{root_path}/{course_id}" for course_id in courses)
    yield from (path for path in candidates if os.path.exists(path) == exist)

def get_modules_path(exist=True):
    """Yield ``<root_path>/<course>/<module>`` for each module whose directory presence equals ``exist``."""
    candidates = (
        f"{root_path}/{course_id}/{module_id}"
        for course_id, course_entry in courses.items()
        for module_id in course_entry['modules']
    )
    yield from (path for path in candidates if os.path.exists(path) == exist)

def get_lessons_path(path, file_ext):
    """Yield ``<path>/<course>/<module>/<lesson>.<file_ext>`` for every lesson file that exists."""
    for course_id, course_entry in courses.items():
        for module_id, module_entry in course_entry['modules'].items():
            module_root = f"{path}/{course_id}/{module_id}"
            for lesson_id in module_entry['lessons']:
                candidate = f"{module_root}/{lesson_id}.{file_ext}"
                if os.path.exists(candidate):
                    yield candidate

def get_workdirs_path(exist=True):
    """Yield ``<course dir>/working_dir`` for existing course dirs, keeping those whose presence equals ``exist``."""
    for course_path in get_courses_path():
        workdir = f"{course_path}/working_dir"
        if os.path.exists(workdir) != exist:
            continue
        yield workdir

def get_lab_url(id):
    """Return the cloudskillsboost.google course-session URL for ``id``."""
    base_url = "https://www.cloudskillsboost.google/course_sessions"
    return f"{base_url}/{id}"

def ge_labs_url():
    """Yield ``(tag, url)`` for every lesson whose ``type`` is ``'lab'``.

    ``tag`` reads ``"<course> / <module> / <lesson>"`` using the generated ids.
    """
    for course_id, course_entry in courses.items():
        for module_id, module_entry in course_entry['modules'].items():
            for lesson_id, lesson_entry in module_entry['lessons'].items():
                if lesson_entry['type'] != 'lab':
                    continue
                label = f'{course_id} / {module_id} / {lesson_id}'
                yield label, get_lab_url(lesson_entry['id'])

def get_videos():
    """Yield, from the raw ``courses_data`` list, each lesson dict whose ``type`` is ``'video'``."""
    all_lessons = (
        lesson_data
        for course_data in courses_data
        for module_data in course_data['modules']
        for lesson_data in module_data['lessons']
    )
    yield from (entry for entry in all_lessons if entry['type'] == 'video')
def check_lab_url():
    """Request every lab URL and print the ones that answer with HTTP 500."""
    import requests

    # NOTE(review): the session cookie value is empty in the source; presumably
    # it has to be filled in before running - confirm.
    session_cookie = {'_cvl-4_1_14_session': ''}

    for lab_tag, lab_url in ge_labs_url():
        status = requests.get(lab_url, cookies=session_cookie).status_code
        if status != 500:
            continue
        print(f'{lab_tag} - {lab_url}')

# check_lab_url()
def check_videos():
    """Refresh each video's ``date`` via ``ytb`` and rewrite the courses JSON file.

    A video whose lookup returns a falsy response is printed (id and name) and
    left untouched.
    """
    import ytb

    client = ytb.authenticate()

    for video in get_videos():
        details = ytb.get_video_details(client, id=video['id'])
        if details:
            # Only the first 10 characters (YYYY-MM-DD) of publish_time are parsed.
            published = datetime.date.fromisoformat(details['publish_time'][:10])
            video['date'] = str(published)
        else:
            print(f"{video['id']} - {video['name']}")

    # get_videos() yields the dicts held by courses_data, so the updates above
    # are part of what gets serialized here.
    with open('../data/gcp_courses.json', 'w') as f:
        f.write(json.dumps(courses_data, indent=4))

# check_videos()
def rename_resource():
    """Recompute the id-based path of every lesson, module and course on disk.

    As written this is a dry run: the ``print`` / ``os.rename`` lines are
    commented out, so nothing is reported or moved until they are enabled.
    Lessons are handled first, then modules, then courses.
    """
    lesson_ext = 'md'

    # get_lessons_path() requires the base path and the file extension.
    for item_path in get_lessons_path(root_path, lesson_ext):
        course, module, lesson_file = item_path.split('/')[-3:]
        # The yielded path ends in ".<ext>"; the lessons dict is keyed by the bare id.
        lesson = os.path.splitext(lesson_file)[0]
        item_name = courses[course]['modules'][module]['lessons'][lesson]['name']
        new_item_id = get_item_id(item_name, 51)
        # Keep the extension so an enabled rename does not strip it from the file.
        new_item_path = f"{root_path}/{course}/{module}/{new_item_id}.{lesson_ext}"
        # print(f'{lesson} -> {new_item_id}')
        # os.rename(item_path, new_item_path)

    for item_path in get_modules_path():
        course, module = item_path.split('/')[-2:]
        item_name = courses[course]['modules'][module]['name']
        new_item_id = get_item_id(item_name, 28)
        new_item_path = f"{root_path}/{course}/{new_item_id}"
        # print(f'{module} -> {new_item_id}')
        # os.rename(item_path, new_item_path)

    for item_path in get_courses_path():
        course = os.path.basename(item_path)
        item_name = courses[course]['name']
        new_item_id = get_item_id(item_name, 51)
        new_item_path = f"{root_path}/{new_item_id}"
        # print(f'{course} -> {new_item_id}')
        # os.rename(item_path, new_item_path)

# rename_resource()

Create modules dir

# Create the directory of every module listed in the course data that is not on disk yet.
for missing_module_path in get_modules_path(exist=False):
    os.makedirs(missing_module_path)

Create Working dirs

# Give every existing course directory a working_dir folder holding a one-line index page.
for working_path in get_workdirs_path(exist=False):
    os.mkdir(working_path)
    stub_index = f"{working_path}/index.md"
    with open(stub_index, 'w') as f:
        f.write("# Working dir\n")

Copy lessons to the respective module

Create slides

# Read back the slides already embedded in each lesson page:
# slides_data[course][module][lesson] -> list of {'name': ..., 'data': ...}.
# Courses and modules that end up with no lesson entry are left out.
slides_data = {}
for course, course_data in courses.items():
    slides_course = {}
    for module, module_data in course_data['modules'].items():
        module_path = f"{root_path}/{course}/{module}"
        if not os.path.exists(module_path):
            continue

        slides_module = {}
        for lesson in module_data['lessons']:
            lesson_path = f"{module_path}/{lesson}.md"
            if not os.path.exists(lesson_path):
                continue

            with open(lesson_path) as f:
                soup = BeautifulSoup(f.read(), 'html.parser')

            parsed_slides = []
            for slide in soup.find_all('div', 'slides'):
                # First text line of the slide's first div; its last '/'-separated
                # piece is kept as the image name.
                first_line = slide.div.text.strip().split('\n')[0]
                parsed_slides.append({
                    'name': first_line.split('/')[-1],
                    'data': slide.div.div.div.text.strip(),
                })

            # A lesson page without any slide div still gets an (empty) entry.
            slides_module[lesson] = parsed_slides

        if slides_module:
            slides_course[module] = slides_module

    if slides_course:
        slides_data[course] = slides_course
    
imgs_path = '../../../images/gcp_courses'

# Regenerate each lesson page collected in slides_data: a header with the
# slide navigation sidebar, followed by one block per slide.
for course, course_slides in slides_data.items():
    for module, module_slides in course_slides.items():
        for lesson, lesson_slides in module_slides.items():
            lesson_dir = f"{course}/{module}/{lesson}"
            title = courses[course]['modules'][module]['lessons'][lesson]['name']
            total = len(lesson_slides)

            page_parts = [
f"""\
# {title}

<aside class="margin sidebar">

::::{{grid}}
:::{{grid-item}}
:::
:::{{grid-item}}
<div id="slide-controls" class="btn-toolbar justify-content-between">

<button id="arrow_back" class="sd-btn">{{material-regular}}`arrow_back_ios;1.2em`</button>

<button id="arrow_forward" class="sd-btn">{{material-regular}}`arrow_forward_ios;1.2em`</button>
</div>
:::
::::
</aside>
"""
            ]

            for position, slide in enumerate(lesson_slides, start=1):
                img_path = f"{imgs_path}/{lesson_dir}/{slide['name']}"
                # Alt text reads "<n>/<total> <lesson title>".
                title_img = f"{position}/{total} {title}"
                page_parts.append(
f"""\
<div class="slides">
<div>

```{{image}} {img_path}
:alt: "{title_img}"
:class: slide-img
```
<div class="cell tag_remove-input tag_output_scroll docutils container">
<div class="cell_output docutils container">

{slide['data']}
</div>
</div>
</div>
</div>
"""
                )

            with open(f"{root_path}/{lesson_dir}.md", 'w') as f:
                f.write(''.join(page_parts))

Create indexes

def get_html_link(url, text):
    """Return an ``<a class="reference external">`` tag for ``url`` with ``target="_blank"``."""
    attributes = f'class="reference external" href="{url}" target="_blank"'
    return f'<a {attributes}>{text}</a>'

def get_course_link(course_num, text):
    """Return an HTML link labelled ``text`` to the course-template page ``course_num``."""
    template_url = f"https://www.cloudskillsboost.google/course_templates/{course_num}"
    return get_html_link(template_url, text)

def get_video_link(id, text):
    """Return an HTML link labelled ``text`` to the YouTube watch page of video ``id``."""
    watch_url = f"https://www.youtube.com/watch?v={id}"
    return get_html_link(watch_url, text)

def get_lab_link(id, text):
    """Return an HTML link labelled ``text`` to the lab URL built by ``get_lab_url(id)``."""
    return get_html_link(get_lab_url(id), text)

def get_nb_link(nb_source, text):
    """Return an HTML link labelled ``text`` to a notebook in a GoogleCloudPlatform repo.

    ``nb_source`` is ``"<repo>/<path inside the repo>"``; the link targets the
    ``master`` branch. A value without ``/`` raises ``ValueError`` on unpacking.
    """
    repo, nb_path = nb_source.split('/', maxsplit=1)
    github_url = f'https://github.com/GoogleCloudPlatform/{repo}/blob/master/{nb_path}'
    return get_html_link(github_url, text)

def create_index(file_path, content):
    """Write ``content`` to ``file_path``, replacing whatever the file held."""
    with open(file_path, mode='w') as index_file:
        index_file.write(content)
        
def get_module_toc(course, module, module_data):
    """Build the markdown listing of one module in two variants.

    Args:
        course: Course id, used to locate lesson files under ``root_path``.
        module: Module id, used for file lookup and for course-level links.
        module_data: Module entry providing ``name`` and a ``lessons`` mapping
            of lesson id -> lesson dict (``name``, ``type`` and type-specific keys).

    Returns:
        ``(module_info, course_info)``. ``module_info`` starts with an H1 title
        and links lessons relative to the module directory; ``course_info``
        starts with an H2 title and links them relative to the course directory.
    """
    title = module_data['name']
    module_info = f"# {title}\n"
    course_info = f"\n## {title}\n"

    for lesson, values in module_data['lessons'].items():
        lesson_name = values['name']
        lesson_path = f"{root_path}/{course}/{module}/{lesson}.md"

        # Link the lesson only when its page exists on disk.
        if os.path.exists(lesson_path):
            lesson_module = f"**[{lesson_name}]({lesson}.md)**"
            lesson_course = f"**[{lesson_name}]({module}/{lesson}.md)**"
        else:
            lesson_module = lesson_name
            lesson_course = lesson_name

        info = ""
        if values['type'] == 'video':
            video_url = get_video_link(values['id'], 'Video')
            published = datetime.date.fromisoformat(values['date'])
            # Day without zero padding, e.g. "Mar 5, 2021". Built from .day
            # because the "%-d" strftime directive is a platform extension.
            date = f"{published:%b} {published.day}, {published:%Y}"
            module_info += f"\n[{video_url}] - {lesson_module} - {date}\n"
            course_info += f"\n[{video_url}] - {lesson_course} - {date}\n"
        elif values['type'] == 'lab':
            lab_url = get_lab_link(values['id'], 'Lab')
            info += f"\n[{lab_url}] - **{lesson_name}**\n"
            nb_lab = values['nb_lab']

            if nb_lab != '':
                nb_lab_url = get_nb_link(nb_lab, 'lab')
                nb_name = os.path.splitext(os.path.basename(nb_lab))[0]

                # The solution notebook link is optional.
                nb_sol = values['nb_sol']
                nb_sol_url = get_nb_link(nb_sol, 'sol') if nb_sol else ''
                sol_info = f", {nb_sol_url}" if nb_sol_url else ''

                info += f"* `{nb_name}` - {nb_lab_url}{sol_info}\n"
        elif values['type'] == 'doc':
            info += f"\n[{get_html_link(values['url'], 'Doc')}] - {lesson_name}\n"

        if values.get('references', False):
            info += "\nReferences:\n"
            for ref in values['references']:
                info += f"* {ref}\n"

        # `info` is identical for both variants; only video lines differ.
        module_info += info
        course_info += info

    return module_info, course_info

# Write index.md and README.md (same content) for every course, and for every
# module whose directory exists.
for course, course_data in courses.items():
    course_root = f"{root_path}/{course}"
    toc_parts = [
        f"# {course_data['name']}\n\n",
        f"[{get_course_link(course_data['num'], 'Course')}]\n",
    ]

    for module, module_data in course_data['modules'].items():
        module_info, course_info = get_module_toc(course, module, module_data)
        toc_parts.append(course_info)

        module_path = f"{course_root}/{module}"
        if os.path.exists(module_path):
            for index_name in ('index.md', 'README.md'):
                create_index(f"{module_path}/{index_name}", module_info)

    course_toc = ''.join(toc_parts)
    for index_name in ('index.md', 'README.md'):
        create_index(f"{course_root}/{index_name}", course_toc)

Create toc yaml

def valid_section(path):
    """Report whether directory ``path`` holds anything besides its index pages.

    True when some entry has no ``.`` in its name, or is a ``.md`` / ``.ipynb``
    file other than ``index.md`` and ``README.md``; False otherwise.
    """
    index_pages = ('index.md', 'README.md')

    def counts(entry):
        if '.' not in entry:
            return True
        return entry.endswith(('.ipynb', '.md')) and entry not in index_pages

    return any(counts(entry) for entry in os.listdir(path))

def is_index_empty(file_path):
    """Return True when the file ends right after its first line, else False."""
    with open(file_path) as index_file:
        index_file.readline()  # skip the first line
        second_line = index_file.readline()
    # readline() returns '' only at end of file; a blank line reads as '\n'.
    return second_line == ''
      
# Remove module directories that hold at most one entry.
# NOTE(review): the index step writes both index.md and README.md into every
# existing module directory, so this threshold may never be met - confirm.
for module_path in get_modules_path():
    entries = os.listdir(module_path)
    if len(entries) > 1:
        continue
    shutil.rmtree(module_path)

# Remove working dirs that hold at most one entry and whose index.md ends
# after its first line.
for working_path in get_workdirs_path():
    if is_index_empty(f'{working_path}/index.md') and len(os.listdir(working_path)) <= 1:
        shutil.rmtree(working_path)

# Build the Jupyter Book table of contents as one string and write it to toc_path.
# The text is whitespace-sensitive YAML: two extra spaces per nesting level.
# Fixed chapters come first, ending with the opening of the gcp_courses chapter.
yaml = \
f"""\
format: jb-book
root: index
chapters:
- file: elearning_ml_data/index
  sections:
  - glob: elearning_ml_data/*
- file: mldev_tools/index
  sections:
  - glob: mldev_tools/*
- file: dev_notes/index
  sections:
  - glob: dev_notes/*
- file: ml_notes/index
  sections:
  - glob: ml_notes/*
- file: {root_name}/index
  sections:"""

# One entry per course; modules, lessons and the working dir are nested below it.
# *_dir names are paths relative to the book root (what goes into the TOC),
# *_path names are the matching filesystem paths used for existence checks.
for course, course_data in courses.items():
    course_dir = f"{root_name}/{course}"
    course_path = f"{root_path}/{course}"
    yaml += \
f"""
  - file: {course_dir}/index"""
    # NOTE(review): "sections:" is emitted whenever valid_section() is true, but
    # children are only added for modules / working dirs found below - confirm a
    # directory cannot pass valid_section() without producing a child entry.
    if valid_section(course_path):
        yaml += \
f"""
    sections:"""

    for module, module_data in course_data['modules'].items():
        module_dir = f"{course_dir}/{module}"
        module_path = f"{course_path}/{module}"
        # Modules without a directory on disk are left out of the TOC.
        if os.path.exists(module_path):
            yaml += \
f"""
    - file: {module_dir}/index"""
    
            if valid_section(module_path):
                yaml += \
f"""
      sections:"""

        # Only lessons that have a .md page are listed.
        for lesson in module_data['lessons'].keys():
            lesson_md_dir = f"{module_dir}/{lesson}"
            lesson_md_path = f"{module_path}/{lesson}.md"
            if os.path.exists(lesson_md_path):
                yaml += \
f"""
      - file: {lesson_md_dir}"""

    # The course working dir, when present, is appended after the modules and
    # its contents are pulled in with a glob.
    working_dir = f"{course_dir}/working_dir"
    working_path = f"{course_path}/working_dir"
    if os.path.exists(working_path):
        yaml += \
f"""
    - file: {working_dir}/index"""
        if valid_section(working_path):
            yaml += \
f"""
      sections:
      - glob: {working_dir}/*"""
             

# Fixed chapters that follow the gcp_courses chapter.
yaml += \
"""
- file: gcp_official/index
  sections:
  - glob: gcp_official/*
- file: gcp_resources/index
  sections:
  - glob: gcp_resources/*
- file: tf_official/index
  sections:
  - file: tf_official/structured_data/index
    sections:
    - glob: tf_official/structured_data/*
  - file: tf_official/distribute/index
    sections:
    - glob: tf_official/distribute/*
- file: tf_resources/index
  sections:
  - glob: tf_resources/*
- file: feature_eng/index
- file: deeplearningai/index
  sections:
  - file: deeplearningai/tf_dev_course_original/index
    sections:
    - glob: deeplearningai/tf_dev_course_original/*
  - file: deeplearningai/tf_dev_course_updated/index
    sections:
    - glob: deeplearningai/tf_dev_course_updated/*
"""

# Overwrite the TOC file with the assembled text.
with open(toc_path, 'w') as f:
     f.write(yaml)

Update Indexes

# Copy every README.md found below the parent directory onto a sibling index.md.
indexes = glob.glob('../**/README.md', recursive=True)

for readme_path in indexes:
    target_dir, _ = os.path.split(readme_path)
    shutil.copy(readme_path, os.path.join(target_dir, 'index.md'))

Generate extra html you want to integrate

jupyter-book build .

Remove visual elements

def clean_html(file_path, soup, slide):
    """Strip Jupyter Book chrome from a parsed page and write it back.

    The page is modified and rewritten only when it contains
    ``div.bd-sidebar__bottom``. That div is removed here, so a page that has
    already been cleaned is left untouched on a second pass. The other
    elements are removed when present and skipped when absent.

    Args:
        file_path: HTML file overwritten with the cleaned markup.
        soup: Parsed document corresponding to ``file_path``.
        slide: When falsy, the header icon container is emptied as well.
    """
    # Powered by Jupyter Book
    sidebar_bottom = soup.find('div', 'bd-sidebar__bottom')
    if sidebar_bottom is None:
        return
    sidebar_bottom.decompose()

    if not slide:
        # Remove header icons
        header_icons = soup.find('div', 'header-article__right')
        if header_icons is not None:
            header_icons.clear()

    # Remove footer, copyright
    footer = soup.find('div', 'footer-content row')
    if footer is not None:
        footer.decompose()

    # Remove previous / next texts (find_all, as used elsewhere in this file,
    # instead of the deprecated findAll alias)
    for subtitle in soup.find_all('p', 'prev-next-subtitle'):
        subtitle.decompose()

    with open(file_path, 'w') as f:
        f.write(str(soup))

def process_slide(file_path):
    """Adapt a built lesson page for slide presentation, then clean it.

    Pages without ``div.menu-dropdown`` are left untouched. Otherwise the
    download menu and the page ``h1`` are removed, the header button tooltips
    are blanked, the first ``button.headerbtn`` is rewired to call
    ``presentationMode()``, and the page is passed to ``clean_html``.

    Args:
        file_path: Path of the HTML file to rewrite in place.
    """
    with open(file_path) as f:
        # Name the parser explicitly (as the slide extraction step does) so the
        # result does not depend on which parsers happen to be installed.
        soup = BeautifulSoup(f.read(), 'html.parser')

    # Remove download .md and print .pdf
    tag = soup.find('div', 'menu-dropdown')
    if tag is None:
        return
    tag.decompose()

    # Remove toc button text
    tag = soup.find('label', 'headerbtn')
    tag['title'] = ''

    # Change behavior of fullscreen button
    tag = soup.find('button', 'headerbtn')
    tag['title'] = ''
    tag['onclick'] = 'presentationMode()'

    # Remove h1
    tag = soup.find('section', 'tex2jax_ignore')
    tag.h1.decompose()

    clean_html(file_path, soup, True)

def process_no_slide(file_path):
    """Parse a built HTML page and run the generic clean-up (``slide=False``) on it.

    Args:
        file_path: Path of the HTML file to rewrite in place.
    """
    with open(file_path) as f:
        # Name the parser explicitly (as the slide extraction step does) so the
        # result does not depend on which parsers happen to be installed.
        soup = BeautifulSoup(f.read(), 'html.parser')

    clean_html(file_path, soup, False)
    
# First pass: slide-specific processing of the built lesson pages (8 workers).
lesson_pages = get_lessons_path(f"{root_dir}/_build/html/gcp_courses", 'html')
with Pool(8) as p:
    p.map(process_slide, lesson_pages)

# Second pass: generic clean-up of every HTML file under _build/html.
pattern = "_build/html/**/*.html"
html_files = glob.glob(f"{root_dir}/{pattern}", recursive=True)

with Pool(8) as p:
    p.map(process_no_slide, html_files)

Publish

ghp-import -n -p -f _build/html