GCP Courses Builder
Contents
GCP Courses Builder
import os
import glob
import json
import re
import shutil
import string
import datetime
from bs4 import BeautifulSoup
from multiprocessing import Pool
# Name of the directory, inside the book repository, that holds the course pages.
root_name = 'gcp_courses'
# Local checkout of the book repository that this script reads and writes.
root_dir = '/home/eavelar/dev/eavelardev.github.io'
root_path = f"{root_dir}/{root_name}"
# Table-of-contents file that is regenerated further down.
toc_path = f"{root_dir}/_toc.yml"

# Course catalogue: a list of courses, each with 'modules', each with 'lessons'.
# The encoding is given explicitly so that decoding does not depend on the
# locale of the machine running the script.
with open('../data/gcp_courses.json', encoding='utf-8') as f:
    courses_data = json.load(f)
# Characters collapsed into "_" when building an ID: ASCII punctuation plus
# the typographic apostrophe.
punctuation = string.punctuation + '’'

# Phrase -> abbreviation pairs, applied in this order. Longer and plural forms
# come before the shorter phrases they contain.
_ABBREVIATIONS = (
    ('introduction', 'intro'),
    ('computer_vision', 'cv'),
    ('convolutional_neural_networks', 'cnns'),
    ('convolutional_neural_network', 'cnn'),
    ('deep_neural_networks', 'dnns'),
    ('deep_neural_network', 'dnn'),
    ('neural_networks', 'nns'),
    ('neural_network', 'nn'),
    ('google_cloud', 'gcp'),
    ('bigquery', 'bq'),
    ('tensorflow', 'tf'),
    ('machine_learning', 'ml'),
    ('natural_language_processing', 'nlp'),
)


def get_item_id(resource, size):
    """Turn a human-readable title into a short lowercase identifier.

    Every run of punctuation and/or spaces becomes a single "_", the result
    is lowercased, known phrases are abbreviated, and the string is cut to
    at most `size` characters. One trailing "_" left by the cut is dropped.

    Args:
        resource: Title of the course, module or lesson.
        size: Maximum length of the returned identifier.

    Returns:
        The identifier as a string.
    """
    separator_run = '[%s ]+' % re.escape(punctuation)
    slug = re.sub(separator_run, '_', resource).lower()
    for phrase, abbreviation in _ABBREVIATIONS:
        slug = slug.replace(phrase, abbreviation)
    slug = slug[:size]
    return slug[:-1] if slug.endswith('_') else slug
def get_course_id(course):
    """Return the identifier for a course title, at most 51 characters long."""
    return get_item_id(course, size=51)
def get_module_id(module):
    """Return the identifier for a module title, at most 28 characters long."""
    return get_item_id(module, size=28)
def get_lesson_id(lesson):
    """Return the identifier for a lesson title, at most 51 characters long."""
    return get_item_id(lesson, size=51)
# Re-index the catalogue by identifier:
#   courses[course_id]['modules'][module_id]['lessons'][lesson_id] -> lesson dict
# Course and module dicts are shallow copies whose 'modules' / 'lessons' lists
# are replaced by dicts; lesson dicts are the very objects held in courses_data.
courses = {}
for course_data in courses_data:
    modules_by_id = {}
    for module_data in course_data['modules']:
        lessons_by_id = {
            get_lesson_id(lesson_data['name']): lesson_data
            for lesson_data in module_data['lessons']
        }
        modules_by_id[get_module_id(module_data['name'])] = {
            **module_data,
            'lessons': lessons_by_id,
        }
    courses[get_course_id(course_data['name'])] = {
        **course_data,
        'modules': modules_by_id,
    }
def get_courses_path(exist=True):
    """Yield course directory paths whose on-disk existence equals `exist`.

    Args:
        exist: True to list the course directories that exist, False to list
            the ones that are missing.
    """
    for course_id in courses:
        candidate = f"{root_path}/{course_id}"
        if os.path.exists(candidate) != exist:
            continue
        yield candidate
def get_modules_path(exist=True):
    """Yield module directory paths whose on-disk existence equals `exist`.

    Args:
        exist: True to list the module directories that exist, False to list
            the ones that are missing.
    """
    for course_id, course_entry in courses.items():
        for module_id in course_entry['modules']:
            candidate = f"{root_path}/{course_id}/{module_id}"
            if os.path.exists(candidate) != exist:
                continue
            yield candidate
def get_lessons_path(path, file_ext):
    """Yield the lesson files that exist under `path`.

    Args:
        path: Root directory laid out as <course>/<module>/<lesson>.<ext>.
        file_ext: File extension to look for, without the leading dot.
    """
    for course_id, course_entry in courses.items():
        for module_id, module_entry in course_entry['modules'].items():
            for lesson_id in module_entry['lessons']:
                candidate = f"{path}/{course_id}/{module_id}/{lesson_id}.{file_ext}"
                if os.path.exists(candidate):
                    yield candidate
def get_workdirs_path(exist=True):
    """Yield "working_dir" paths of existing courses, filtered by existence.

    Only course directories that are present on disk are considered.

    Args:
        exist: True to list the working directories that exist, False to list
            the ones that are missing.
    """
    for course_path in get_courses_path():
        candidate = f"{course_path}/working_dir"
        if os.path.exists(candidate) != exist:
            continue
        yield candidate
def get_lab_url(id):
    """Return the cloudskillsboost.google course-session URL for a lab id."""
    return "https://www.cloudskillsboost.google/course_sessions/{}".format(id)
def ge_labs_url():
    """Yield a (tag, url) pair for every lesson of type 'lab'.

    The tag has the form "<course> / <module> / <lesson>", built from the
    identifiers; the URL comes from get_lab_url().
    """
    for course_id, course_entry in courses.items():
        for module_id, module_entry in course_entry['modules'].items():
            for lesson_id, lesson_entry in module_entry['lessons'].items():
                if lesson_entry['type'] != 'lab':
                    continue
                label = f'{course_id} / {module_id} / {lesson_id}'
                yield label, get_lab_url(lesson_entry['id'])
def get_videos():
    """Yield every lesson dict of type 'video' straight from courses_data.

    The yielded dicts are the objects stored in courses_data, so changes made
    to them by a caller are changes to the loaded catalogue.
    """
    for course_entry in courses_data:
        for module_entry in course_entry['modules']:
            yield from (
                lesson_entry
                for lesson_entry in module_entry['lessons']
                if lesson_entry['type'] == 'video'
            )
def check_lab_url():
    """Print every lab whose course-session URL answers with HTTP 500.

    Each lab URL is requested once. A request that fails outright (timeout,
    connection error, ...) is reported on its own line and the check carries
    on with the next lab.
    """
    import requests

    # Cookie sent with every request. The value is blank here; presumably a
    # valid session value has to be pasted in before running - TODO confirm.
    cookie = {
        '_cvl-4_1_14_session': ''
    }
    for lab_tag, lab_url in ge_labs_url():
        try:
            # requests waits indefinitely unless a timeout is given.
            response = requests.get(lab_url, cookies=cookie, timeout=30)
        except requests.RequestException as error:
            print(f'{lab_tag} - {lab_url} - request failed: {error}')
            continue
        if response.status_code == 500:
            print(f'{lab_tag} - {lab_url}')


# check_lab_url()
def check_videos():
    """Refresh the 'date' of every video lesson and save the catalogue.

    For each video the details are fetched through the `ytb` module. Videos
    for which nothing is returned are printed as "<id> - <name>" and left
    untouched; the others get their 'date' set from the publish time. The
    whole catalogue is then written back to ../data/gcp_courses.json.
    """
    import ytb

    client = ytb.authenticate()
    for video in get_videos():
        details = ytb.get_video_details(client, id=video['id'])
        if details:
            # update date: only the leading YYYY-MM-DD part of the timestamp is used
            published = datetime.date.fromisoformat(details['publish_time'][:10])
            video['date'] = str(published)
        else:
            print(f"{video['id']} - {video['name']}")

    with open('../data/gcp_courses.json', 'w') as f:
        f.write(json.dumps(courses_data, indent=4))


# check_videos()
def rename_resource(*, dry_run=True):
    """Rename lesson files, module dirs and course dirs to their current IDs.

    For every lesson file, module directory and course directory found on
    disk, the ID is recomputed from the item's name and the item is renamed
    when the result differs from its present name. Lessons are handled first,
    then modules, then courses, so that a parent directory is never renamed
    before its children have been visited.

    NOTE(review): `courses` is keyed by IDs produced by the same helpers, so
    with the code as it stands old and new names coincide; this looks like a
    migration aid for when the ID scheme is being changed.

    Args:
        dry_run: When True (the default) the planned renames are only
            printed; when False they are carried out with os.rename().
    """
    def _rename(old_path, new_path):
        # Nothing to do when the name is already up to date.
        if new_path == old_path:
            return
        if dry_run:
            print(f'{old_path} -> {new_path}')
        else:
            os.rename(old_path, new_path)

    # get_lessons_path() needs the root and the extension, and the paths it
    # yields end in ".<ext>": the extension is split off to obtain the lesson
    # key and put back on the new name.
    for item_path in get_lessons_path(root_path, 'md'):
        course, module, lesson_file = item_path.split('/')[-3:]
        lesson, file_ext = os.path.splitext(lesson_file)
        item_name = courses[course]['modules'][module]['lessons'][lesson]['name']
        new_item_id = get_lesson_id(item_name)
        _rename(item_path, f"{root_path}/{course}/{module}/{new_item_id}{file_ext}")

    for item_path in get_modules_path():
        course, module = item_path.split('/')[-2:]
        item_name = courses[course]['modules'][module]['name']
        new_item_id = get_module_id(item_name)
        _rename(item_path, f"{root_path}/{course}/{new_item_id}")

    for item_path in get_courses_path():
        course = os.path.basename(item_path)
        item_name = courses[course]['name']
        new_item_id = get_course_id(item_name)
        _rename(item_path, f"{root_path}/{new_item_id}")


# rename_resource()
Create modules dir
# Create a directory (and any missing parent) for every module that does not
# have one yet.
missing_module_dirs = get_modules_path(exist=False)
for module_path in missing_module_dirs:
    os.makedirs(module_path)
Create working dirs
# Give every existing course a "working_dir" holding a title-only index.md.
for working_path in get_workdirs_path(exist=False):
    os.mkdir(working_path)
    placeholder_index = f"{working_path}/index.md"
    with open(placeholder_index, 'w') as f:
        f.write("# Working dir\n")
Copy lessons to the respective module
Create slides
# Read the slides back out of the lesson markdown files that exist on disk.
# Result: slides_data[course][module][lesson] -> list of {'name', 'data'},
# where 'name' is the last path component of the first line of text inside the
# slide's first <div> and 'data' is the text of the <div> nested two levels
# below it. Modules and courses without any lesson file are left out.
slides_data = {}
for course, course_data in courses.items():
    slides_course = {}
    for module, module_data in course_data['modules'].items():
        module_path = f"{root_path}/{course}/{module}"
        if not os.path.exists(module_path):
            continue

        slides_module = {}
        for lesson in module_data['lessons']:
            lesson_path = f"{module_path}/{lesson}.md"
            if not os.path.exists(lesson_path):
                continue

            with open(lesson_path) as f:
                soup = BeautifulSoup(f.read(), 'html.parser')

            slides_module[lesson] = [
                {
                    'name': slide.div.text.strip().split('\n')[0].split('/')[-1],
                    'data': slide.div.div.div.text.strip(),
                }
                for slide in soup.find_all('div', 'slides')
            ]

        if slides_module:
            slides_course[module] = slides_module

    if slides_course:
        slides_data[course] = slides_course
# Rewrite every lesson file collected in slides_data from a fixed template:
# a title, a sidebar with back/forward buttons, then one "slides" <div> per
# slide holding the image directive and the slide text.
# Prefix placed in front of "<course>/<module>/<lesson>/<image name>" in each
# image directive.
imgs_path = '../../../images/gcp_courses'
for course, course_data in slides_data.items():
for module, module_data in course_data.items():
for lesson, lesson_data in module_data.items():
lesson_dir = f"{course}/{module}/{lesson}"
md_path = f"{root_path}/{lesson_dir}.md"
title = courses[course]['modules'][module]['lessons'][lesson]['name']
# Page header. Inside the f-strings, {'{grid}'} and similar produce the
# literal text "{grid}" in the output.
lesson_content =\
f"""\
# {title}
<aside class="margin sidebar">
::::{'{grid}'}
:::{'{grid-item}'}
:::
:::{'{grid-item}'}
<div id="slide-controls" class="btn-toolbar justify-content-between">
<button id="arrow_back" class="sd-btn">{'{material-regular}'}`arrow_back_ios;1.2em`</button>
<button id="arrow_forward" class="sd-btn">{'{material-regular}'}`arrow_forward_ios;1.2em`</button>
</div>
:::
::::
</aside>
"""
# One block per slide; the image alt text is "<position>/<total> <lesson title>".
for i, img in enumerate(lesson_data):
img_path = f"{imgs_path}/{lesson_dir}/{img['name']}"
slide_num = f"{i+1}/{len(lesson_data)}"
title_img = f"{slide_num} {title}"
lesson_content +=\
f"""\
<div class="slides">
<div>
```{'{image}'} {img_path}
:alt: "{title_img}"
:class: slide-img
```
<div class="cell tag_remove-input tag_output_scroll docutils container">
<div class="cell_output docutils container">
{img['data']}
</div>
</div>
</div>
</div>
"""
# Overwrite the lesson file with the regenerated content.
with open(md_path, 'w') as f:
f.write(lesson_content)
Create indexes
def get_html_link(url, text):
    """Return an HTML anchor for `url` labelled `text`, opening in a new tab.

    Neither argument is escaped; both are inserted as given.
    """
    return '<a class="reference external" href="{}" target="_blank">{}</a>'.format(url, text)
def get_course_link(course_num, text):
    """Return an HTML link to the cloudskillsboost.google course template page."""
    return get_html_link(
        f"https://www.cloudskillsboost.google/course_templates/{course_num}",
        text,
    )
def get_video_link(id, text):
    """Return an HTML link to the YouTube watch page of video `id`."""
    return get_html_link(f"https://www.youtube.com/watch?v={id}", text)
def get_lab_link(id, text):
    """Return an HTML link to the course-session page of lab `id`."""
    return get_html_link(get_lab_url(id), text)
def get_nb_link(nb_source, text):
    """Return an HTML link to a notebook in a GoogleCloudPlatform GitHub repo.

    Args:
        nb_source: "<repo>/<path inside the repo>"; everything before the
            first "/" is the repository name. A value without "/" raises
            ValueError.
        text: Link label.
    """
    repo, nb_path = nb_source.split('/', maxsplit=1)
    return get_html_link(
        f'https://github.com/GoogleCloudPlatform/{repo}/blob/master/{nb_path}',
        text,
    )
def create_index(file_path, content):
    """Write `content` to `file_path`, replacing whatever the file held."""
    with open(file_path, mode='w') as index_file:
        index_file.write(content)
def get_module_toc(course, module, module_data):
    """Build the markdown lesson listing of one module, in two flavours.

    Args:
        course: Course identifier (directory name).
        module: Module identifier (directory name).
        module_data: Module entry from `courses`: 'name' plus a 'lessons'
            dict keyed by lesson identifier.

    Returns:
        A tuple (module_info, course_info). `module_info` is the content of
        the module's own index page ("# <title>" heading, lesson links
        relative to the module directory). `course_info` is the section to
        append to the course index ("## <title>" heading, lesson links
        prefixed with the module directory).
    """
    title = module_data['name']
    module_info = f"# {title}\n"
    course_info = f"\n## {title}\n"

    for lesson, values in module_data['lessons'].items():
        lesson_name = values['name']
        lesson_path = f"{root_path}/{course}/{module}/{lesson}.md"
        # Lessons that have a page on disk are shown as bold links, the
        # others as plain text.
        if os.path.exists(lesson_path):
            lesson_module = f"**[{lesson_name}]({lesson}.md)**"
            lesson_course = f"**[{lesson_name}]({module}/{lesson}.md)**"
        else:
            lesson_module = lesson_course = lesson_name

        # Text that is identical in both flavours is gathered in `info`.
        info = ""
        lesson_type = values['type']
        if lesson_type == 'video':
            video_url = get_video_link(values['id'], 'Video')
            published = datetime.date.fromisoformat(values['date'])
            # The day number is taken from the date object: the "%-d"
            # directive (day without leading zero) is platform-specific.
            date = f"{published:%b} {published.day}, {published:%Y}"
            module_info += f"\n[{video_url}] - {lesson_module} - {date}\n"
            course_info += f"\n[{video_url}] - {lesson_course} - {date}\n"
        elif lesson_type == 'lab':
            lab_url = get_lab_link(values['id'], 'Lab')
            info += f"\n[{lab_url}] - **{lesson_name}**\n"
            nb_lab = values['nb_lab']
            if nb_lab != '':
                nb_lab_url = get_nb_link(nb_lab, 'lab')
                nb_name = os.path.splitext(os.path.basename(nb_lab))[0]
                nb_sol = values['nb_sol']
                # The solution notebook is optional.
                nb_sol_url = get_nb_link(nb_sol, 'sol') if nb_sol else ''
                sol_info = f", {nb_sol_url}" if nb_sol_url else ''
                info += f"* `{nb_name}` - {nb_lab_url}{sol_info}\n"
        elif lesson_type == 'doc':
            info += f"\n[{get_html_link(values['url'], 'Doc')}] - {lesson_name}\n"

        if values.get('references'):
            info += "\nReferences:\n"
            for ref in values['references']:
                info += f"* {ref}\n"

        module_info += info
        course_info += info

    return module_info, course_info
# Every index is written twice with identical content: index.md and README.md.
index_names = ('index.md', 'README.md')

for course, course_data in courses.items():
    toc_parts = [
        f"# {course_data['name']}\n\n",
        f"[{get_course_link(course_data['num'], 'Course')}]\n",
    ]
    for module, module_data in course_data['modules'].items():
        module_info, course_info = get_module_toc(course, module, module_data)
        toc_parts.append(course_info)

        # Module indexes are only written for modules that have a directory.
        module_path = f"{root_path}/{course}/{module}"
        if os.path.exists(module_path):
            for index_name in index_names:
                create_index(f"{module_path}/{index_name}", module_info)

    course_toc = "".join(toc_parts)
    for index_name in index_names:
        create_index(f"{root_path}/{course}/{index_name}", course_toc)
Create toc yaml
def valid_section(path):
    """Tell whether directory `path` holds anything besides its index files.

    An entry counts when its name has no "." in it, or when it ends in
    ".ipynb" or ".md" and is neither "index.md" nor "README.md".

    Returns:
        True as soon as one such entry is found, otherwise False.
    """
    index_files = ('index.md', 'README.md')

    def counts_as_content(entry):
        if '.' not in entry:
            return True
        return entry.endswith(('.ipynb', '.md')) and entry not in index_files

    return any(counts_as_content(entry) for entry in os.listdir(path))
def is_index_empty(file_path):
    """Return True when the file has nothing after its first line."""
    with open(file_path) as index_file:
        index_file.readline()  # first line is skipped
        second_line = index_file.readline()
    # readline() returns '' only at end of file.
    return not second_line
# Remove empty modules
# Both index.md and README.md are generated for every module directory, so a
# module is empty when it holds nothing besides those two files. Counting
# entries against 1 would never match once both files are present.
generated_index_files = {'index.md', 'README.md'}
for module_path in get_modules_path():
    if set(os.listdir(module_path)) <= generated_index_files:
        shutil.rmtree(module_path)

# Remove working dirs that still hold nothing but their title-only index.md.
for working_path in get_workdirs_path():
    index = f'{working_path}/index.md'
    index_empty = is_index_empty(index)
    if index_empty and len(os.listdir(working_path)) <= 1:
        shutil.rmtree(working_path)
# Build the text of _toc.yml in three parts: a fixed list of chapters, the
# generated chapter for the courses, and another fixed list of chapters.
# NOTE(review): the nesting of the emitted YAML is carried by the whitespace
# inside the string literals below; keep it intact when editing.
yaml = \
f"""\
format: jb-book
root: index
chapters:
- file: elearning_ml_data/index
sections:
- glob: elearning_ml_data/*
- file: mldev_tools/index
sections:
- glob: mldev_tools/*
- file: dev_notes/index
sections:
- glob: dev_notes/*
- file: ml_notes/index
sections:
- glob: ml_notes/*
- file: {root_name}/index
sections:"""
# One entry per course. A "sections:" list is only opened when valid_section()
# finds content; modules are listed when their directory exists and lessons
# when their .md file exists.
for course, course_data in courses.items():
course_dir = f"{root_name}/{course}"
course_path = f"{root_path}/{course}"
yaml += \
f"""
- file: {course_dir}/index"""
if valid_section(course_path):
yaml += \
f"""
sections:"""
for module, module_data in course_data['modules'].items():
module_dir = f"{course_dir}/{module}"
module_path = f"{course_path}/{module}"
if os.path.exists(module_path):
yaml += \
f"""
- file: {module_dir}/index"""
if valid_section(module_path):
yaml += \
f"""
sections:"""
for lesson in module_data['lessons'].keys():
lesson_md_dir = f"{module_dir}/{lesson}"
lesson_md_path = f"{module_path}/{lesson}.md"
if os.path.exists(lesson_md_path):
yaml += \
f"""
- file: {lesson_md_dir}"""
# The course's working_dir, when present, is listed too; its content is
# pulled in with a glob instead of being enumerated.
working_dir = f"{course_dir}/working_dir"
working_path = f"{course_path}/working_dir"
if os.path.exists(working_path):
yaml += \
f"""
- file: {working_dir}/index"""
if valid_section(working_path):
yaml += \
f"""
sections:
- glob: {working_dir}/*"""
# Fixed chapters that follow the courses.
yaml += \
"""
- file: gcp_official/index
sections:
- glob: gcp_official/*
- file: gcp_resources/index
sections:
- glob: gcp_resources/*
- file: tf_official/index
sections:
- file: tf_official/structured_data/index
sections:
- glob: tf_official/structured_data/*
- file: tf_official/distribute/index
sections:
- glob: tf_official/distribute/*
- file: tf_resources/index
sections:
- glob: tf_resources/*
- file: feature_eng/index
- file: deeplearningai/index
sections:
- file: deeplearningai/tf_dev_course_original/index
sections:
- glob: deeplearningai/tf_dev_course_original/*
- file: deeplearningai/tf_dev_course_updated/index
sections:
- glob: deeplearningai/tf_dev_course_updated/*
"""
# Replace the table of contents file with the generated text.
with open(toc_path, 'w') as f:
f.write(yaml)
Update Indexes
# Copy every README.md found below the parent directory over the index.md
# that sits next to it.
indexes = glob.glob('../**/README.md', recursive=True)
for readme_path in indexes:
    index_path = os.path.join(os.path.dirname(readme_path), 'index.md')
    shutil.copy(readme_path, index_path)
Generate extra html you want to integrate
jupyter-book build .
Remove visual elements
def clean_html(file_path, soup, slide):
    """Strip theme decorations from a parsed page and write it to disk.

    Removes the sidebar bottom block, the footer and the previous/next
    subtitles; for non-slide pages the right-hand header area is emptied as
    well. Elements that are not found are skipped, so a page that was
    already cleaned can be passed through again without failing.

    Args:
        file_path: File the resulting HTML is written to.
        soup: Parsed document (BeautifulSoup); it is modified in place.
        slide: True for slide pages, whose header icons are kept.
    """
    # Powered by Jupyter Book
    tag = soup.find('div', 'bd-sidebar__bottom')
    if tag is not None:
        tag.decompose()

    if not slide:
        # Remove header icons
        tag_header = soup.find('div', 'header-article__right')
        if tag_header is not None:
            tag_header.clear()

    # Remove footer, copyright
    tag = soup.find('div', 'footer-content row')
    if tag is not None:
        tag.decompose()

    # remove previous / next texts
    for tag in soup.find_all('p', 'prev-next-subtitle'):
        tag.decompose()

    with open(file_path, 'w') as f:
        f.write(str(soup))
def process_slide(file_path):
    """Adapt one built lesson page for slide presentation, in place.

    The page is parsed with the same parser used elsewhere in this file, the
    header controls are adjusted, the page's <h1> is removed, and the result
    is passed to clean_html() with slide=True, which writes it back. Elements
    that are not present in the page are skipped.

    Args:
        file_path: Path of the HTML file to rewrite.
    """
    with open(file_path) as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    # Remove download .md and print .pdf
    tag = soup.find('div', 'menu-dropdown')
    if tag is not None:
        tag.decompose()

    # Remove toc button text
    tag = soup.find('label', 'headerbtn')
    if tag is not None:
        tag['title'] = ''

    # Change behavior of fullscreen button
    tag = soup.find('button', 'headerbtn')
    if tag is not None:
        tag['title'] = ''
        tag['onclick'] = 'presentationMode()'

    # Remove h1
    tag = soup.find('section', 'tex2jax_ignore')
    if tag is not None and tag.h1 is not None:
        tag.h1.decompose()

    clean_html(file_path, soup, True)
def process_no_slide(file_path):
    """Clean one built HTML page that is not a slide page, in place.

    The parser is named explicitly, as it is where the lesson files are
    read, so that the result does not depend on which parsers happen to be
    installed.

    Args:
        file_path: Path of the HTML file to rewrite.
    """
    with open(file_path) as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    clean_html(file_path, soup, False)
# First pass: built lesson pages get the slide treatment, 8 worker processes.
slide_pages = get_lessons_path(f"{root_dir}/_build/html/gcp_courses", 'html')
with Pool(processes=8) as pool:
    pool.map(process_slide, slide_pages)

# Second pass: every HTML file of the build gets the generic clean-up.
# NOTE(review): this glob also matches the lesson pages handled in the first
# pass, so they go through process_no_slide as well - confirm this is intended.
pattern = "_build/html/**/*.html"
html_files = glob.glob(f"{root_dir}/{pattern}", recursive=True)
with Pool(processes=8) as pool:
    pool.map(process_no_slide, html_files)
Publish
ghp-import -n -p -f _build/html