Udemy HTML TOC
==============
::
from pathlib import Path
from bs4 import BeautifulSoup
Input and output file
::
INPUT_FILE = "toc.html"
OUTPUT_FILE = "toc.txt"
Remove inner spaces
::
def normalize_inner_spaces(text: str) -> str:
return " ".join(text.split())
Extract TOC
::
def extract_toc(html_text: str) -> str:
soup = BeautifulSoup(html_text, "html.parser")
lines = []
section_no = 0
lecture_no = 0
# Each curriculum section panel
panels = soup.select("div[class*='curriculum-section-module'][class*='panel']")
for panel in panels:
# Section title
title_el = panel.select_one("span[class*='section-title']")
if not title_el:
continue
section_title = normalize_inner_spaces(title_el.get_text(" ", strip=True))
if not section_title:
continue
section_no += 1
lines.append(f"### {section_no}. {section_title}")
lines.append("")
# Lecture items inside the section
item_rows = panel.select("div.ud-block-list-item-content")
for item in item_rows:
lecture_el = item.select_one("span[class*='course-lecture-title']")
if lecture_el:
lecture_title = normalize_inner_spaces(
lecture_el.get_text(" ", strip=True)
)
if lecture_title:
lecture_no += 1
lines.append(f" {lecture_no}. {lecture_title}")
lines.append("")
return "\n".join(lines).rstrip() + "\n"
Main
::
def main():
input_path = Path(INPUT_FILE)
output_path = Path(OUTPUT_FILE)
html_text = input_path.read_text(encoding="utf-8")
toc_text = extract_toc(html_text)
output_path.write_text(toc_text, encoding="utf-8")
print(f"Created {output_path}")
if __name__ == "__main__":
main()