Udemy
=====

Summarize Udemy Transcript.

Similar to `Obsidian-AI`_, but directly from the book folder. 
Also can remove newlines from Udemy transcripts.


.. _Obsidian-AI: ai_obsidian.py.html

.. csv-table:: Useful Links
   :header: "Name", "URL"
   :widths: 10 30

   "Session State", https://docs.streamlit.io/develop/api-reference/caching-and-state/st.session_state
   "How to count tokens with tiktoken", https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken
   "Model Pricing", https://platform.openai.com/docs/pricing#latest-models

.. contents::
 
::

  import streamlit as st
  import yaml
  import os
  import re
  import tiktoken
  from openai import OpenAI
  import platform
  import pyperclip
  import time
  import subprocess

See: PersistedList_

.. _PersistedList: PersistedList.py.html
  
::

  from PersistedList import PersistedList

Print banner.

::

  st.set_page_config(
      page_title="Udemy"
  )

  @st.cache_data
  def print_banner():
      print("""
       _____  _____      __                                       
      |_   _||_   _|    |  ]                                      
        | |    | |  .--.| | .---.  _ .--..--.    _   __           
        | '    ' |/ /'`\\' |/ /__\\\\[ `.-. .-. |  [ \\ [  ]      
         \\ \\__/ / | \\__/  || \\__., | | | | | |   \\ '/ /      
          `.__.'   '.__.;__]'.__.'[___||__||__][\\_:  /           
                                                \\__.'                  
      """)
      return 1

  print_banner()
  st.logo("https://ea-books.netlify.app/lit/udemy.svg")


Select OpenAI LLM.

::

  llm_prices = {
      "o4-mini": 1.10,
      "o3-mini": 1.10,
      "o3": 2.0,
      "o3-pro": 20.0,
    
      "gemini-2.5-flash-preview-05-20": 0.0,
      "gemma-3-27b-it": 0.0,
      "gemini-2.0-flash": 0.0,
    
      "gpt-4.1-mini": 0.4,
      "gpt-4.1-nano": 0.1,
      "gpt-4.1": 2.0,
      "gpt-4o-mini": 0.15,
      "gpt-4o": 2.5,
  }

  llm_temperature = 0.1

  def get_llm_properties(llm_model):
      if llm_model.startswith("gemini"):
          return {"google": True, "temperature": True, "xml": False}

      elif llm_model.startswith("gemma"):
          return {"google": True, "temperature": True, "xml": True}

      elif llm_model.startswith("gpt"):
          return {"google": False, "temperature": True, "xml": False}

      else: #o3
          return {"google": False, "temperature": False, "xml": False}
        
  def reset_execution_time():
      if "execution_time" in st.session_state:
          del st.session_state["execution_time"]

Remember which LLM was used last time.

::

  llm_models = list(llm_prices.keys())
  llm_models_persisted = PersistedList(".udemy")
  llm_models = llm_models_persisted.sort_by_pattern(llm_models)

  llm_model = st.sidebar.selectbox(
      "LLM Models", 
      llm_models,
      on_change=reset_execution_time
  )

Obsidian folder
---------------

Find the Obsidian folder, which is the first subfolder within the current folder that has a name ending with " Book".

::

  current_folder = os.getcwd()
  home_folders = os.listdir(current_folder)
  book_folders = [item for item in home_folders if os.path.isdir(os.path.join(current_folder, item)) and item.endswith(" Book")]

  if (len(book_folders)==0):
      st.error('The folder should contain a subfolder with a name that ends with " Book".')
      st.stop()
  
  note_home =  book_folders[0]
  # print("OBSIDIAN_HOME: " + note_home)

Output file to save response.

::

  home_directory = os.path.expanduser("~")
  output_folder = os.path.join(home_directory, ".a-services")
  if not os.path.exists(output_folder):
      os.makedirs(output_folder)
    
  out_file = os.path.join(output_folder, 'udemy.txt')
  adoc_file = os.path.join(output_folder, 'udemy.adoc')

OpenAI and Gemini clients

::  

  client = OpenAI()

  g_key = os.getenv("GEMINI_API_KEY")
  g_client = OpenAI(
      api_key=g_key,
      base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
  )

Newest files 
------------

Get ``num_files`` newest files from the provided ``directory``.

::
    
  def get_newest_files(directory, num_files):
      # Check if the directory exists
      if not os.path.isdir(directory):
          raise ValueError(f"The directory {directory} does not exist.")

      # Get a list of files in the directory with their full paths and modification times
      files_with_paths = []
      for file_name in os.listdir(directory):
          file_path = os.path.join(directory, file_name)
          if os.path.isfile(file_path):
              files_with_paths.append((file_path, os.path.getmtime(file_path)))

      # Sort files by modification time in descending order (newest first)
      sorted_files = sorted(files_with_paths, key=lambda x: x[1], reverse=True)

      # Extract the num_files newest file names
      newest_files = [os.path.basename(file_with_path[0]) for file_with_path in sorted_files[:num_files]]

      return newest_files

Select ``note_name`` from 5 newest notes.

::

  newest_files = get_newest_files(note_home, 5)
  note_name = st.selectbox(
     "Note",
     newest_files,
  )

Load Obsidian note
------------------

::

  file_path = os.path.join(note_home, note_name)
  with open(file_path, 'r', encoding='utf-8') as file:
      text = file.read()

 
Write truncated input text

::
    
  # Truncate text to max len
  def max_len(text, k):
      if len(text) <= k:
          return text
      return text[:k] + '...'  

  st.write(f"""
 
  {max_len(text, 250)}
 
  """)

Tokens & price
--------------

::


  tiktoken_model = "o200k_base"
  #encoding = tiktoken.get_encoding(tiktoken_model) 
  encoding = tiktoken.encoding_for_model("gpt-4o-mini")
  tokens = encoding.encode(text)
  
Calculate price in cents.

::

  cents = round(len(tokens) * llm_prices[llm_model]/10000, 5)

  st.sidebar.write(f'''
      | Chars | Tokens | Cents |
      |---|---|---|
      | {len(text)} | {len(tokens)} | {cents} |
      ''')  
       
st.sidebar.divider()


Buttons to update text
----------------------

- Replace newlines with spaces, and
- Remove empty lines from text

::
    
  def remove_empty_lines_and_leading_hyphens(text):
      lines = text.splitlines()
      non_empty_lines = [line for line in lines if line.strip()]
    
      # Remove leading hyphens
      stripped = [
          line[1:].lstrip() if line.startswith('-') else line
          for line in non_empty_lines
      ]
    
      cleaned_text = '\n'.join(stripped)
      return cleaned_text

  def replace_newlines_with_spaces(input_string):
      # An inexpensive method to remove empty lines without using extra logic such as leading hyphens.
      return input_string.replace('\n', ' ')
 
  if st.button(':small_red_triangle_down: &nbsp; **Replace newlines with spaces**', use_container_width=True):
      text = remove_empty_lines_and_leading_hyphens(text)
    
      with open(file_path, 'w', encoding='utf-8') as file:
          file.write(text)
        
      st.rerun()    


Call OpenAI API
---------------

::
    
  prompt_summarize = """You will be provided with statements in markdown, 
  and your task is to summarize the content you are provided.
  """

  prompt_improve = """You will be provided with statements in markdown, 
  and your task is to improve the content you are provided.
  """
  prompt_questions = """
  You will be provided with context in markdown,
  and your task is to generate 3 questions this context can provide
  specific answers to which are unlikely to be found elsewhere.

  Higher-level summaries of surrounding context may be provided
  as well. Try using these summaries to generate better questions
  that this context can answer.
  """

  if 'openai_result' not in st.session_state:
      st.session_state.openai_result = ""
 
  def call_llm(text, prompt):
      llm_models_persisted.select(llm_model)
      props = get_llm_properties(llm_model)
      llm_client = g_client if props["google"] else client

      if props["xml"]:
          messages = [
              {"role": "user", "content": f"<prompt>{prompt}</prompt>\n<query>{text}</query>"},
          ]
      else:
          messages = [
              {"role": "developer", "content": prompt},
              {"role": "user", "content": text},
          ]

      if props["temperature"]:
          response = llm_client.chat.completions.create(
              model=llm_model,
              messages=messages,
              temperature=llm_temperature,
          )
      else:
          response = llm_client.chat.completions.create(
              model=llm_model,
              messages=messages,
          )

      choice = response.choices[0]
      out_text = choice.message.content
      st.session_state.openai_result = out_text

      st.write(st.session_state.openai_result)

      with open(out_file, 'w') as file:
          file.write(out_text)
      st.sidebar.write(f'Response saved: `{out_file}`')  

      if platform.system() == 'Darwin':
          os.system("afplay /System/Library/Sounds/Glass.aiff")

Show OpenAI result.

::

  # st.write('---')
  st.write(st.session_state.openai_result)
  # st.write('---')

  if st.sidebar.button(':sparkles: &nbsp; Summarize', use_container_width=True):
      start_time = time.time()
      call_llm(text, prompt_summarize)
      end_time = time.time()
      st.session_state.execution_time = end_time - start_time
      st.rerun()
    
  if st.sidebar.button(':question: &nbsp; Ask questions', use_container_width=True):
      start_time = time.time()
      call_llm(text, prompt_questions)
      end_time = time.time()
      st.session_state.execution_time = end_time - start_time
      st.rerun()
 
  if st.sidebar.button(':thumbsup: &nbsp; Improve', use_container_width=True):
      start_time = time.time()
      call_llm(text, prompt_improve)
      end_time = time.time()
      st.session_state.execution_time = end_time - start_time
      st.rerun()
 
Convert to Asciidoc

::

  def convert_to_asciidoc(markdown):
      subprocess.run(["pandoc", "-f", "gfm", "-s", out_file, "-o", adoc_file], check=True)
      with open(adoc_file, "r", encoding="utf-8") as fin:
          result = fin.read()    
      return result

Copy to clipboard

::

  if len(st.session_state.openai_result) > 0:
      if st.sidebar.button(':clipboard: &nbsp; Copy to clipboard', type='primary', use_container_width=True):
          pyperclip.copy(st.session_state.openai_result)
          st.toast(f'Copied to clipboard')
        
Copy Asciidoc to clipboard

::

  def bump_headers(text: str, n: int) -> str:
      """Add n '=' characters to the start of each AsciiDoc header line."""
      if n == 0:
          return text
        
      prefix = '=' * n
      # Match lines starting with one or more '=' but not lines with only '=' (adornments)
      pattern = re.compile(r'^(=+)(?=\s)', re.MULTILINE)
      return pattern.sub(lambda m: prefix + m.group(1), text)
    
  def asciidoc_headers(content):
      # This will remove the entire line if it matches, including the newline.
      cleaned_content = re.sub(r'^\[\[.*?\]\]\s*\n', '', content, flags=re.MULTILINE)
      return cleaned_content
    
  bump_headers_n = st.sidebar.number_input("Bump headers", value=0, min_value=0)

  if len(st.session_state.openai_result) > 0:
      if st.sidebar.button(':clipboard: &nbsp; Copy Asciidoc to clipboard', type='primary', use_container_width=True):
          pyperclip.copy(asciidoc_headers(bump_headers(convert_to_asciidoc(st.session_state.openai_result), bump_headers_n)))
          st.toast(f'Copied to clipboard')
        
Show last execution time

::

  if "execution_time" in st.session_state:
      st.sidebar.write(f"Execution time: `{round(st.session_state.execution_time, 2)}` sec")