Udemy
=====
Summarize Udemy Transcript.
Similar to `Obsidian-AI`_, but directly from the book folder.
It can also remove newlines from Udemy transcripts.
.. _Obsidian-AI: ai_obsidian.py.html
.. csv-table:: Useful Links
:header: "Name", "URL"
:widths: 10 30
"Session State", https://docs.streamlit.io/develop/api-reference/caching-and-state/st.session_state
"How to count tokens with tiktoken", https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken
"Model Pricing", https://platform.openai.com/docs/pricing#latest-models
.. contents::
::
import streamlit as st
import yaml
import os
import re
import tiktoken
from openai import OpenAI
import platform
import pyperclip
import time
import subprocess
See: PersistedList_
.. _PersistedList: PersistedList.py.html
::
from PersistedList import PersistedList
Print banner.
::
# Set the page title of this Streamlit app.
st.set_page_config(page_title="Udemy")
@st.cache_data
def print_banner():
    """Print the ASCII-art banner to stdout and return 1.

    The function is wrapped in ``st.cache_data``; the constant return value
    gives the cache something to store. Presumably this keeps the banner from
    being printed again on every script rerun -- TODO confirm.
    """
    banner = """
_____ _____ __
|_ _||_ _| | ]
| | | | .--.| | .---. _ .--..--. _ __
| ' ' |/ /'`\\' |/ /__\\\\[ `.-. .-. | [ \\ [ ]
\\ \\__/ / | \\__/ || \\__., | | | | | | \\ '/ /
`.__.' '.__.;__]'.__.'[___||__||__][\\_: /
\\__.'
"""
    print(banner)
    return 1
# Emit the console banner (print_banner is wrapped in st.cache_data).
print_banner()
# Show the app logo, loaded from the hosted SVG.
st.logo("https://ea-books.netlify.app/lit/udemy.svg")
Select the LLM (OpenAI or Google Gemini/Gemma models).
::
# Selectable models mapped to the price used for the sidebar cost estimate.
# The estimate is computed as tokens * price / 10000 and labelled "Cents",
# which implies USD per 1M tokens -- presumably the input-token price from the
# "Model Pricing" page; verify when updating.
# The key order is the default order of the model list.
llm_prices = {
"o4-mini": 1.10,
"o3-mini": 1.10,
"o3": 2.0,
"o3-pro": 20.0,
# Google-hosted models are priced at 0.0 here (presumably free tier -- TODO confirm).
"gemini-2.5-flash-preview-05-20": 0.0,
"gemma-3-27b-it": 0.0,
"gemini-2.0-flash": 0.0,
"gpt-4.1-mini": 0.4,
"gpt-4.1-nano": 0.1,
"gpt-4.1": 2.0,
"gpt-4o-mini": 0.15,
"gpt-4o": 2.5,
}
# Sampling temperature sent with requests to models flagged as accepting one.
llm_temperature = 0.1
def get_llm_properties(llm_model):
    """Return the capability flags for a model, chosen by its name prefix.

    The returned dict has three boolean keys:

    - ``google``: route the request through the Gemini client.
    - ``temperature``: the request may carry a ``temperature`` argument.
    - ``xml``: the prompt is folded into the single user message instead of
      being sent as a separate developer message.

    Any name that is not ``gemini*``, ``gemma*`` or ``gpt*`` (e.g. the o-series
    models) gets all three flags set to False.
    """
    # (prefix, (google, temperature, xml)) -- first matching prefix wins.
    prefix_table = (
        ("gemini", (True, True, False)),
        ("gemma", (True, True, True)),
        ("gpt", (False, True, False)),
    )
    flags = (False, False, False)
    for prefix, candidate in prefix_table:
        if llm_model.startswith(prefix):
            flags = candidate
            break
    google, temperature, xml = flags
    return {"google": google, "temperature": temperature, "xml": xml}
def reset_execution_time():
    """Forget the stored execution time, if any.

    Used as the ``on_change`` callback of the model selector, so a timing
    measured with one model is not shown after another model is picked.
    """
    if "execution_time" not in st.session_state:
        return
    del st.session_state["execution_time"]
Remember which LLM was used last time.
::
# Persisted helper backed by ".udemy"; call_llm() records the chosen model in it.
llm_models_persisted = PersistedList(".udemy")
# Start from the declaration order of llm_prices and let the persisted list
# reorder it (presumably most recently used first -- see PersistedList).
llm_models = llm_models_persisted.sort_by_pattern(list(llm_prices))
# Changing the model discards the previously measured execution time.
llm_model = st.sidebar.selectbox("LLM Models", llm_models, on_change=reset_execution_time)
Obsidian folder
---------------
Find the Obsidian folder, which is the first subfolder within the current folder that has a name ending with " Book".
::
current_folder = os.getcwd()
# os.listdir() returns entries in arbitrary order; sort them so that "the first
# ... Book folder" is the same folder on every run and on every platform.
home_folders = sorted(os.listdir(current_folder))
book_folders = [
    item
    for item in home_folders
    if item.endswith(" Book") and os.path.isdir(os.path.join(current_folder, item))
]
if not book_folders:
    st.error('The folder should contain a subfolder with a name that ends with " Book".')
    st.stop()
# Folder name relative to the current working directory.
note_home = book_folders[0]
Output file to save response.
::
# Responses are stored under ~/.a-services.
home_directory = os.path.expanduser("~")
output_folder = os.path.join(home_directory, ".a-services")
# exist_ok=True avoids the check-then-create race of a separate exists() test.
os.makedirs(output_folder, exist_ok=True)
# Raw LLM response, and the pandoc-converted AsciiDoc version of it.
out_file = os.path.join(output_folder, 'udemy.txt')
adoc_file = os.path.join(output_folder, 'udemy.adoc')
OpenAI and Gemini clients
::
# Default OpenAI client (no explicit arguments).
client = OpenAI()

# Gemini is reached through Google's OpenAI-compatible endpoint with its own key.
# NOTE(review): if GEMINI_API_KEY is unset, g_key is None -- check how the
# OpenAI client behaves when api_key=None is passed explicitly.
g_key = os.environ.get("GEMINI_API_KEY")
g_client = OpenAI(
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
    api_key=g_key,
)
Newest files
------------
Get ``num_files`` newest files from the provided ``directory``.
::
def get_newest_files(directory, num_files):
    """Return the base names of the most recently modified files in a folder.

    Args:
        directory: Folder to scan; sub-directories are ignored and not descended into.
        num_files: Maximum number of names to return.

    Returns:
        Up to ``num_files`` file names, newest modification time first.

    Raises:
        ValueError: If ``directory`` is not an existing directory.
    """
    if not os.path.isdir(directory):
        raise ValueError(f"The directory {directory} does not exist.")

    # Keep regular files only.
    candidates = [
        name
        for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    ]

    # Newest first; the sort is stable, so files with equal mtimes keep listing order.
    candidates.sort(
        key=lambda name: os.path.getmtime(os.path.join(directory, name)),
        reverse=True,
    )
    return candidates[:num_files]
Select ``note_name`` from the 5 newest notes.
::
# Offer the five most recently modified files of the book folder.
newest_files = get_newest_files(note_home, 5)
if not newest_files:
    # With no options the selectbox yields None, which would crash the path
    # join that loads the note; stop with a readable message instead.
    st.error(f'The folder "{note_home}" does not contain any files.')
    st.stop()
note_name = st.selectbox(
    "Note",
    newest_files,
)
Load Obsidian note
------------------
::
# Path of the selected note inside the book folder.
file_path = os.path.join(note_home, note_name)
# Read the whole note as UTF-8 text (read mode is the default for open()).
with open(file_path, encoding='utf-8') as file:
    text = file.read()
Write truncated input text
::
def max_len(text, k):
    """Return ``text`` cut to at most ``k`` characters.

    Text that already fits is returned unchanged; otherwise the first ``k``
    characters are kept and ``'...'`` is appended to mark the cut.
    """
    return text if len(text) <= k else text[:k] + '...'
# Show a preview of the loaded note, limited to its first 250 characters.
st.write(f"""
{max_len(text, 250)}
""")
Tokens & price
--------------
::
# Encoding name; in the visible source it is referenced only by the
# commented-out alternative below.
tiktoken_model = "o200k_base"
#encoding = tiktoken.get_encoding(tiktoken_model)
# The tokenizer is always resolved for "gpt-4o-mini", whichever model is
# selected in the sidebar, so the count is presumably only an estimate for
# other models -- TODO confirm.
encoding = tiktoken.encoding_for_model("gpt-4o-mini")
# Token ids of the whole note; only their count is used for the cost estimate.
tokens = encoding.encode(text)
Calculate price in cents.
::
# Estimated input cost in cents, rounded to 5 decimals. Assuming llm_prices
# holds USD per 1M tokens: tokens * price / 1_000_000 dollars * 100 = tokens * price / 10000.
cents = round(len(tokens) * llm_prices[llm_model]/10000, 5)
# Show the input size and the estimate as a Markdown table in the sidebar.
st.sidebar.write(f'''
| Chars | Tokens | Cents |
|---|---|---|
| {len(text)} | {len(tokens)} | {cents} |
''')
st.sidebar.divider()
Buttons to update text
----------------------
- Replace newlines with spaces, and
- Remove empty lines from text
::
def remove_empty_lines_and_leading_hyphens(text):
    """Drop blank lines and strip one leading hyphen from each remaining line.

    A line counts as blank when it contains only whitespace. For a line whose
    very first character is ``-``, that hyphen and the whitespace following it
    are removed; indented hyphens are left alone. The surviving lines are
    joined with ``\\n`` and no trailing newline is added.
    """
    kept = []
    for raw in text.splitlines():
        if not raw.strip():
            continue  # whitespace-only line
        kept.append(raw[1:].lstrip() if raw.startswith('-') else raw)
    return '\n'.join(kept)
def replace_newlines_with_spaces(input_string):
    """Return ``input_string`` with every ``\\n`` replaced by a single space.

    This is a plain character substitution: blank lines turn into runs of
    spaces, and leading hyphens are left untouched.
    """
    return ' '.join(input_string.split('\n'))
# NOTE(review): the label says "Replace newlines with spaces", but the handler
# drops empty lines and leading hyphens; replace_newlines_with_spaces() is not
# called here -- confirm which behaviour is intended.
cleanup_requested = st.button(
    ':small_red_triangle_down: **Replace newlines with spaces**',
    use_container_width=True,
)
if cleanup_requested:
    text = remove_empty_lines_and_leading_hyphens(text)
    # Write the cleaned text back into the note, then rerun the script so the
    # page is rebuilt from the updated file.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(text)
    st.rerun()
Call OpenAI API
---------------
::
# Instruction prompts passed to call_llm() together with the note text.

# Used by the "Summarize" button.
prompt_summarize = """You will be provided with statements in markdown,
and your task is to summarize the content you are provided.
"""
# Used by the "Improve" button.
prompt_improve = """You will be provided with statements in markdown,
and your task is to improve the content you are provided.
"""
# Used by the "Ask questions" button.
prompt_questions = """
You will be provided with context in markdown,
and your task is to generate 3 questions this context can provide
specific answers to which are unlikely to be found elsewhere.
Higher-level summaries of surrounding context may be provided
as well. Try using these summaries to generate better questions
that this context can answer.
"""
# Make sure the slot for the last LLM answer exists, so the code that displays
# or copies it can read it unconditionally.
if 'openai_result' not in st.session_state:
    st.session_state['openai_result'] = ""
def call_llm(text, prompt):
    """Send ``prompt`` and ``text`` to the selected model and publish the answer.

    The answer is stored in ``st.session_state.openai_result``, written to the
    page, and saved to ``out_file`` as UTF-8. On macOS a system sound is played
    when the call has finished.

    Args:
        text: The note content to process.
        prompt: The instruction for the model.
    """
    # Record the model as used so it is remembered for the next session.
    llm_models_persisted.select(llm_model)

    props = get_llm_properties(llm_model)
    llm_client = g_client if props["google"] else client

    if props["xml"]:
        # Instruction and content travel together in a single user message.
        messages = [
            {"role": "user", "content": f"{prompt}\n{text}"},
        ]
    else:
        messages = [
            {"role": "developer", "content": prompt},
            {"role": "user", "content": text},
        ]

    # Build the request once; temperature is sent only to models flagged as accepting it.
    request_args = {"model": llm_model, "messages": messages}
    if props["temperature"]:
        request_args["temperature"] = llm_temperature
    response = llm_client.chat.completions.create(**request_args)

    # The message content can be None; normalise to "" so the file write and
    # the later length checks on the stored result do not raise.
    out_text = response.choices[0].message.content or ""
    st.session_state.openai_result = out_text
    st.write(st.session_state.openai_result)

    # Explicit UTF-8: the platform default encoding may be unable to represent
    # the answer, and the AsciiDoc file produced from it is read back as UTF-8.
    with open(out_file, 'w', encoding='utf-8') as file:
        file.write(out_text)
    st.sidebar.write(f'Response saved: `{out_file}`')

    # Audible "done" signal, macOS only.
    if platform.system() == 'Darwin':
        os.system("afplay /System/Library/Sounds/Glass.aiff")
Show OpenAI result.
::
# Show the answer kept from the last LLM call (empty string before the first call).
st.write(st.session_state.openai_result)


def _run_prompt_timed(prompt):
    """Run call_llm() on the loaded note with ``prompt`` and record the elapsed seconds.

    The duration is stored in ``st.session_state.execution_time`` and the script
    is rerun so the page is rebuilt from the stored result.
    """
    # perf_counter() is monotonic, so the measurement is unaffected by
    # system clock adjustments.
    started = time.perf_counter()
    call_llm(text, prompt)
    st.session_state.execution_time = time.perf_counter() - started
    st.rerun()


if st.sidebar.button(':sparkles: Summarize', use_container_width=True):
    _run_prompt_timed(prompt_summarize)

if st.sidebar.button(':question: Ask questions', use_container_width=True):
    _run_prompt_timed(prompt_questions)

if st.sidebar.button(':thumbsup: Improve', use_container_width=True):
    _run_prompt_timed(prompt_improve)
Convert to Asciidoc
::
def convert_to_asciidoc(markdown):
    """Convert GitHub-flavoured Markdown to a standalone AsciiDoc document using pandoc.

    The Markdown is piped to pandoc on stdin as UTF-8, so the result always
    reflects the ``markdown`` argument rather than whatever was last saved on
    disk. pandoc writes its output to ``adoc_file``, which is then read back.

    Args:
        markdown: Markdown source text.

    Returns:
        The AsciiDoc text produced by pandoc.

    Raises:
        subprocess.CalledProcessError: If pandoc exits with a non-zero status.
        FileNotFoundError: If the ``pandoc`` executable cannot be found.
    """
    subprocess.run(
        ["pandoc", "-f", "gfm", "-s", "-o", adoc_file],
        input=markdown,
        encoding="utf-8",
        check=True,
    )
    with open(adoc_file, "r", encoding="utf-8") as fin:
        return fin.read()
Copy to clipboard
::
# The button is offered only once there is an answer to copy; the length check
# comes first so the button is not rendered for an empty result.
if len(st.session_state.openai_result) > 0 and st.sidebar.button(
    ':clipboard: Copy to clipboard', type='primary', use_container_width=True
):
    pyperclip.copy(st.session_state.openai_result)
    st.toast('Copied to clipboard')
Copy Asciidoc to clipboard
::
def bump_headers(text: str, n: int) -> str:
    """Add ``n`` '=' characters to the start of each AsciiDoc header line.

    A header line is a run of '=' at the start of a line followed by at least
    one space or tab and then title text (``== Title``). Lines made only of
    '=' characters, such as ``====`` delimiters, are left untouched.

    Args:
        text: AsciiDoc source.
        n: Number of levels to add; values <= 0 return ``text`` unchanged.

    Returns:
        The text with every header line deepened by ``n`` levels.
    """
    if n <= 0:
        return text
    prefix = '=' * n
    # Require blank(s) followed by a non-blank character after the '=' run.
    # A bare ``\s`` lookahead would also accept the newline that ends an
    # '='-only line and so bump delimiter lines as well.
    pattern = re.compile(r'^(=+)(?=[ \t]+\S)', re.MULTILINE)
    return pattern.sub(lambda m: prefix + m.group(1), text)
def asciidoc_headers(content):
    """Remove AsciiDoc anchor lines of the form ``[[id]]`` from ``content``.

    A line is removed only when it starts with ``[[`` and is terminated by a
    newline. The match also consumes trailing whitespace, which can include
    blank lines directly after the anchor.
    """
    anchor_line = re.compile(r'^\[\[.*?\]\]\s*\n', flags=re.MULTILINE)
    return anchor_line.sub('', content)
# Number of extra '=' levels to add to every header of the converted document.
bump_headers_n = st.sidebar.number_input("Bump headers", value=0, min_value=0)

# Offered only once there is an answer to convert; the length check comes first
# so the button is not rendered for an empty result.
if len(st.session_state.openai_result) > 0 and st.sidebar.button(
    ':clipboard: Copy Asciidoc to clipboard', type='primary', use_container_width=True
):
    # Markdown -> AsciiDoc -> deeper headers -> anchor lines removed.
    adoc_text = convert_to_asciidoc(st.session_state.openai_result)
    adoc_text = bump_headers(adoc_text, bump_headers_n)
    adoc_text = asciidoc_headers(adoc_text)
    pyperclip.copy(adoc_text)
    st.toast('Copied to clipboard')
Show last execution time
::
# Present only after an LLM call has been timed, and until the model is changed.
if "execution_time" in st.session_state:
    elapsed_seconds = round(st.session_state.execution_time, 2)
    st.sidebar.write(f"Execution time: `{elapsed_seconds}` sec")