from bs4 import BeautifulSoup
import re
import argparse
# Heading tags that contribute to the table of contents.
# Note: h6 is reserved for notes and figures, so it is deliberately excluded.
HEADER_TAG_PATTERN = re.compile(r'^h[1-5]$')

# Number of spaces of indentation added per heading level beyond h1.
INDENT_WIDTH = 2


def normalize_heading_text(raw_text):
    """Return *raw_text* with every whitespace run collapsed to one space.

    Leading and trailing whitespace is removed. This is applied to the
    unstripped heading text so that words separated by inline markup
    (e.g. ``<h2>Chapter <em>One</em></h2>``) keep the space between them;
    stripping each fragment individually would glue them together.
    """
    return ' '.join(raw_text.split())


def format_toc_entry(level, text):
    """Return one table-of-contents line for a heading.

    Args:
        level: Heading level as an integer (1 for h1, 2 for h2, ...).
        text: Heading text to display.

    Returns:
        *text* prefixed with ``INDENT_WIDTH`` spaces for each level beyond 1.
    """
    indent = ' ' * (INDENT_WIDTH * (level - 1))
    return f"{indent}{text}"


def main():
    """Print an indented outline of the h1-h5 headings in an HTML file.

    The path of the HTML file is taken from the single positional
    command-line argument ``html_name``.
    """
    parser = argparse.ArgumentParser(description="EPUB TOC")
    parser.add_argument("html_name", help="HTML file extracted from EPUB")
    args = parser.parse_args()

    # Read the whole document up front; the file is closed before parsing.
    with open(args.html_name, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    # find_all yields the matching tags in document order, which is the
    # order the outline should be printed in.
    for header in soup.find_all(HEADER_TAG_PATTERN):
        # The pattern guarantees the tag name is 'h' plus one digit 1-5.
        level = int(header.name[1])
        text = normalize_heading_text(header.get_text())
        print(format_toc_entry(level, text))


if __name__ == "__main__":
    main()