Udemy HTML TOC

from pathlib import Path
from bs4 import BeautifulSoup

Input and output files

# Path of the HTML file that main() reads (relative to the working directory).
INPUT_FILE = "toc.html"
# Path of the text file that main() writes the generated outline to.
OUTPUT_FILE = "toc.txt"

Collapse inner whitespace

def normalize_inner_spaces(text: str) -> str:
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is dropped as well, because
    ``str.split()`` with no arguments discards empty edge fields.
    """
    words = text.split()
    return " ".join(words)

Extract TOC

def extract_toc(html_text: str) -> str:
    """Build a plain-text table of contents from curriculum HTML.

    Every element matching the section-panel selector that has a
    non-empty section title yields a ``### <n>. <title>`` heading,
    followed by a blank line, one ``  <m>. <lecture>`` line per
    non-empty lecture title found in that panel, and another blank line.

    Section numbers count only the panels that produced a heading.
    Lecture numbers are not reset between sections; they keep
    increasing across the whole document.

    Args:
        html_text: Raw HTML markup to parse.

    Returns:
        The outline as a single string with trailing whitespace removed
        and exactly one terminating newline (just ``"\\n"`` when nothing
        matched).
    """
    document = BeautifulSoup(html_text, "html.parser")
    output = []
    section_count = 0
    lecture_count = 0

    # One match per curriculum section panel.
    for section in document.select(
        "div[class*='curriculum-section-module'][class*='panel']"
    ):
        heading = section.select_one("span[class*='section-title']")
        if not heading:
            continue

        heading_text = normalize_inner_spaces(heading.get_text(" ", strip=True))
        if not heading_text:
            continue

        section_count += 1
        output.extend((f"### {section_count}. {heading_text}", ""))

        # Lecture rows that belong to this section.
        for row in section.select("div.ud-block-list-item-content"):
            title_span = row.select_one("span[class*='course-lecture-title']")
            if not title_span:
                continue

            lecture_text = normalize_inner_spaces(
                title_span.get_text(" ", strip=True)
            )
            if not lecture_text:
                continue

            lecture_count += 1
            output.append(f"  {lecture_count}. {lecture_text}")

        # Blank separator after each section's lecture list.
        output.append("")

    return "\n".join(output).rstrip() + "\n"

Main

def main():
    """Convert INPUT_FILE into a text outline stored in OUTPUT_FILE.

    Reads the input as UTF-8, passes it through ``extract_toc``, writes
    the result as UTF-8, and prints the output path when done.
    """
    source = Path(INPUT_FILE)
    destination = Path(OUTPUT_FILE)

    toc_text = extract_toc(source.read_text(encoding="utf-8"))
    destination.write_text(toc_text, encoding="utf-8")

    print(f"Created {destination}")


# Run the conversion only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Udemy HTML TOC

lit

Navigation

Related Topics