# EPUB TOC: print an indented outline of the h1-h5 headings in an HTML file extracted from an EPUB.

from bs4 import BeautifulSoup
import re
import argparse

def normalize_heading_text(raw_text: str) -> str:
    """Return *raw_text* with every whitespace run collapsed to one space.

    Leading and trailing whitespace is removed, and embedded newlines or
    indentation from the HTML source become single spaces, so each heading
    fits on one outline line.
    """
    return " ".join(raw_text.split())


def format_toc_entry(tag_name: str, raw_text: str) -> str:
    """Return one outline line for a heading.

    Args:
        tag_name: Heading tag name of the form ``h<digit>`` (e.g. ``"h2"``).
        raw_text: Unprocessed text content of the heading.

    Returns:
        The normalized heading text, indented two spaces per level beyond 1.
    """
    # Extract header level from the tag name (e.g., 'h2' -> level 2)
    level = int(tag_name[1])
    # Calculate indentation (2 spaces per level beyond 1)
    indent = '  ' * (level - 1)
    return f"{indent}{normalize_heading_text(raw_text)}"


def main() -> None:
    """Parse the command line and print the heading outline of the HTML file."""
    parser = argparse.ArgumentParser(description="EPUB TOC")
    parser.add_argument("html_name", help="HTML file extracted from EPUB")
    args = parser.parse_args()

    # Open and read the HTML file
    with open(args.html_name, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all header tags (h1, h2, ..., h5)
    # Note: h6 is reserved for notes and figures
    headers = soup.find_all(re.compile(r'^h[1-5]$'))

    # Print each header with indentation.
    for header in headers:
        # Take the raw text and normalize whitespace ourselves:
        # get_text(strip=True) joins the stripped fragments with no separator,
        # which glues words together across inline tags
        # (e.g. "Chapter <em>One</em>" would come out as "ChapterOne").
        print(format_toc_entry(header.name, header.get_text()))


if __name__ == "__main__":
    main()