Introduction

In this tutorial, you will learn to combine Shinkai tools to create an AI tool that extracts the text content of a .pptx presentation, generates the text of a lesson about the presentation, and optionally generates an audio file of that lesson. This tool is available in the Shinkai AI Store.

You will learn how to :

  • build and add features, both with Shinkai AI assistance and manually
  • combine Shinkai tools efficiently (optional features, customizability, config validation, error handling, design decisions)
  • implement Optical Character Recognition (OCR)
  • implement text-to-speech
  • use the created tool

This tutorial is a step-by-step guide on how to implement the full tool. You can find the complete code below for reference, but we will go over its elements one by one. You can also find usage examples in the last section of this tutorial (Part 5 : Using the tool).

Full Code
# /// script
# dependencies = [
#   "requests",
#   "python-pptx",
#   "pytesseract",
#   "pillow",
# ]
# ///

from typing import Optional, List
import os
import requests
import re
import shutil
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE
import pytesseract
from PIL import Image
import io
from shinkai_local_support import get_home_path
from shinkai_local_tools import shinkai_llm_prompt_processor, eleven_labs_text_to_speech

class CONFIG:
    tesseract_path: str
    generate_audio: str  # "yes" or "no"

class INPUTS:
    file_path: str
    additional_lesson_generation_instructions: Optional[str] = "none"

class OUTPUT:
    content_of_the_presentation: Optional[List[str]]
    status: str
    message: Optional[str]
    lesson_about_the_presentation: Optional[str]
    audio_file: Optional[str]

def extract_text_from_text_frame(text_frame) -> str:
    return ' '.join([run.text for paragraph in text_frame.paragraphs for run in paragraph.runs if run.text])

def extract_text_from_table(table) -> str:
    cell_texts = []
    for row in table.rows:
        for cell in row.cells:
            cell_texts.append(extract_text_from_text_frame(cell.text_frame))
    return ' | '.join(cell_texts)

def extract_text_from_chart(chart) -> str:
    title = extract_text_from_text_frame(chart.chart_title.text_frame) if chart.has_title else ""
    x_axis = extract_text_from_text_frame(chart.category_axis.axis_title.text_frame) if chart.category_axis.has_title else ""
    y_axis = extract_text_from_text_frame(chart.value_axis.axis_title.text_frame) if chart.value_axis.has_title else ""
    return f"Title: {title}, X-Axis: {x_axis}, Y-Axis: {y_axis}"

def extract_text_from_shape(shape) -> List[str]:
    results = []
    
    if shape.has_text_frame:
        text = extract_text_from_text_frame(shape.text_frame)
        if text.strip():
            results.append(f"Text Shape: {text}")
    elif shape.has_table:
        table_text = extract_text_from_table(shape.table)
        if table_text.strip():
            results.append(f"Table: {table_text}")
    elif shape.has_chart:
        chart_text = extract_text_from_chart(shape.chart)
        if chart_text.replace("Title: , X-Axis: , Y-Axis: ", "").strip():
            results.append(f"Chart: {chart_text}")
    elif shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
        try:
            image_data = shape.image.blob
            image = Image.open(io.BytesIO(image_data))
            text = pytesseract.image_to_string(image)
            if text.strip():
                results.append(f"Image Text: {text.strip()}")
        except Exception as e:
            pass
    elif shape.shape_type == MSO_SHAPE_TYPE.GROUP:
        for sub_shape in shape.shapes:
            sub_texts = extract_text_from_shape(sub_shape)
            if sub_texts:
                results.extend(sub_texts)
    
    return results

def clean_special_characters(text: str) -> str:
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'#+\s*', ' ', text)
    text = re.sub(r'\*+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def get_base_filename(file_path: str) -> str:
    """Extract the base filename without extension from a path"""
    base_name = os.path.basename(file_path)
    if '?' in base_name:
        base_name = base_name.split('?')[0]
    return os.path.splitext(base_name)[0]

async def _read_presentation(file_path: str, working_file_path: str) -> OUTPUT:
    try:
        if file_path.startswith('http://') or file_path.startswith('https://'):
            response = requests.get(file_path)
            with open(working_file_path, 'wb') as f:
                f.write(response.content)
        else:
            with open(file_path, 'rb') as f:
                with open(working_file_path, 'wb') as w:
                    w.write(f.read())
        o = OUTPUT()
        o.status = "success"
        return o
    except Exception as e:
        o = OUTPUT()
        o.status = "error"
        o.message = f"Failed to read presentation: {str(e)}"
        return o

async def _retrieve_presentation(working_file_path: str) -> OUTPUT:
    if not os.path.exists(working_file_path):
        o = OUTPUT()
        o.status = "error"
        o.message = "No presentation file loaded."
        return o
    try:
        prs = Presentation(working_file_path)
        text_content = []
        for i, slide in enumerate(prs.slides, start=1):
            slide_content = []
            for shape in slide.shapes:
                shape_texts = extract_text_from_shape(shape)
                if shape_texts:
                    slide_content.extend(shape_texts)
            
            if slide_content:
                text_content.append(f"Slide {i}")
                text_content.extend(slide_content)
        
        o = OUTPUT()
        o.content_of_the_presentation = text_content
        o.status = "success"
        return o
    except Exception as e:
        o = OUTPUT()
        o.status = "error"
        o.message = f"Failed to process presentation: {str(e)}"
        return o

def validate_config(config: CONFIG) -> List[str]:
    """
    Validate configuration flags and dependencies.
    Returns a list of error messages if any issues are found.
    """
    errors = []
    # Validate tesseract_path
    if not config.tesseract_path:
        errors.append("Tesseract path is not provided.")
    elif not os.path.isfile(config.tesseract_path):
        errors.append(f"Tesseract path does not point to a file: {config.tesseract_path}")
    elif not os.access(config.tesseract_path, os.X_OK):
        errors.append(f"Tesseract executable is not executable: {config.tesseract_path}")
    
    # Validate generate_audio
    if config.generate_audio.lower() not in ["yes", "no"]:
        errors.append(f"Invalid config value for generate_audio: {config.generate_audio}. Must be 'yes' or 'no'.")
    
    return errors

async def run(config: CONFIG, inputs: INPUTS) -> OUTPUT:
    # Perform configuration validation
    errors = validate_config(config)
    if errors:
        o = OUTPUT()
        o.status = "error"
        o.message = "Configuration errors: " + "; ".join(errors)
        return o

    # Set Tesseract path after validation
    pytesseract.pytesseract.tesseract_cmd = config.tesseract_path

    home_path = await get_home_path()
    working_file_path = os.path.join(home_path, 'working.pptx')

    if not inputs.file_path:
        o = OUTPUT()
        o.status = "error"
        o.message = "No file path provided"
        return o

    read_result = await _read_presentation(inputs.file_path, working_file_path)
    if read_result.status == "error":
        return read_result

    process_result = await _retrieve_presentation(working_file_path)
    if process_result.status == "error":
        return process_result

    slides_content = process_result.content_of_the_presentation or []
    if not slides_content:
        o = OUTPUT()
        o.status = "warning"
        o.message = "No text content found in the presentation"
        o.content_of_the_presentation = []
        return o

    # Step 2: Formulate the prompt for the LLM
    prompt = (
        "<personality>"
        "You are a domain expert teacher extremely good at creating university level educational material."
        "</personality>"
        "<task>"
        "Create the text for an audio lesson of the following content extracted from a powerpoint."
        "</task>"
        "<context>"
        "The content comes from text extraction and OCR from a powerpoint, and is provided shape by shape and slide by slide."
        "Because of this the content form each slide might be a bit disorganised or sparse. You need to take your time step-by-step to reconnect the different pieces of content and understand how they fit together."
        "This content might also contain special characters coming from the text extraction formatting. Make sure to handle these special characters gracefully, they shouldn't appear in the text for an audio lesson."
        "</context>"
        "<lesson instructions>"
        "Do your best to understand in detailed the content of each slide, taking into consideration that it might be a bit disorganised due to the text extraction process."
        "The lesson should be very detailed and precise, and go over absolutely all the content in an organised manner. Be sure to use the entirety of the content for the lesson."
        "The lesson should be clear and engaging."
        "Make sure to introduce it when you move on to a new (sub)topic, so that the listener easily follow along."
        "</lesson instructions>"
        "<additional_instructions>"
        f"{inputs.additional_lesson_generation_instructions}"
        "</additional_instructions>"
        "<format instructions>"
        "Ensure the output is in one continuous paragraph without any special formatting or extra characters. Do not use and \ n formatting, just make it one long endless paragraph. It should be flowing text. You can still include sections in it but present them orally, not with special formatting."
        "The text must flow. Do not use any formatting, but just plain text."
        "</format instructions>"
        "<content>"
        "Here is the content to use for the lesson: "
        f"{' '.join(slides_content)}"
        "</content>"
    )

    # Step 3: Use the LLM to generate the audio lesson text
    llm_response = await shinkai_llm_prompt_processor({"prompt": prompt, "format": "text"})
    generated_text = llm_response['message']
    
    # Step 4: Clean any special characters from the generated text
    cleaned_text = clean_special_characters(generated_text)

    # Step 5: Convert the lesson text to audio if configured to do so
    audio_file_path = None
    if config.generate_audio.lower() == "yes":
        try:
            # Generate the audio with the TTS function
            tts_response = await eleven_labs_text_to_speech({"text": cleaned_text})
            generated_audio_path = tts_response['audio_file']
            
            # Get the base filename from the input file path and create the desired output path
            base_filename = get_base_filename(inputs.file_path)
            desired_audio_path = os.path.join(home_path, f"{base_filename}.mp3")
            
            # Rename the generated audio file to match the presentation name
            if os.path.exists(generated_audio_path):
                # Check if the destination file exists and remove it if it does
                if os.path.exists(desired_audio_path):
                    os.remove(desired_audio_path)
                # Copy the generated file to the desired location with the new name
                shutil.copy2(generated_audio_path, desired_audio_path)
                # Remove the original generated file
                os.remove(generated_audio_path)
                audio_file_path = desired_audio_path
            else:
                audio_file_path = generated_audio_path  # Fallback to the original path if something went wrong
        except Exception as e:
            o = OUTPUT()
            o.status = "error"
            o.message = f"Failed to generate or rename audio: {str(e)}"
            return o

    output = OUTPUT()
    output.content_of_the_presentation = slides_content
    output.lesson_about_the_presentation = cleaned_text
    output.audio_file = audio_file_path
    output.status = "success"
    return output

Now let’s see how to recreate this tool and learn its features and implementation details.

Prerequisites

To follow this tutorial, you will need :

  • a Shinkai installation with access to the AI assisted tool creation UI
  • Tesseract installed for OCR (you can download it from https://github.com/UB-Mannheim/tesseract/wiki)
  • the ‘ElevenLabs Text to Speech’ tool from the Shinkai AI Store and an ElevenLabs API key (https://elevenlabs.io/developers), if you choose to generate audio files

Part 0 : Trying to build the full tool in one go with the Shinkai AI assisted tool creation UI

Shinkai offers an effortless tool building experience thanks to its AI assisted tool creation UI, where even library dependencies and tool metadata are handled automatically.

You could try to build a working prototype of the full tool using a single detailed prompt and a performant LLM.

In the tool creation UI, select a performant LLM (e.g. gpt_4o, shinkai_free_trial), select Python, activate the 2 tools “shinkai_llm_prompt_processor” and “eleven_labs_text_to_speech”, write a prompt describing the tool well, and execute it.

For a good result your prompt should be detailed and clearly describe :

  • the goal of the tool to create and its steps
  • how each of the selected tools should be used
  • what you would want in configuration versus inputs
  • which features should be optional
  • how to handle errors

Below is an example of a prompt to generate a full prototype of our PowerPoint-to-audio-lesson tool. It uses tags to make things clear for the LLM. At the very least, such a prompt will create a good code flow for the intended tool, which you can then debug, edit, and improve.

<task>
Create a tool that does 3 things :
step 1 : extract the text content of a .pptx file presentation.
step 2 : uses the shinkai_llm_prompt_processor tool to generate the text for a lesson about the presentation, using the extracted content.
step 3 : uses the eleven_labs_text_to_speech tool to create an audio file by converting the generated lesson text to audio.
</task>

<input instructions>
The .pptx file will be given as a path in input.
Another input will be optional lesson instructions, defaulting to 'none'.
Use strings for both.
</input instructions>

<configuration instructions>
2 configurations.
The path to the Tesseract executable.
A 'yes' or 'no' option for audio generation.
Use strings for both.
</configuration instructions>

<instructions for step 1>
Extract the text content of the .pptx file slide by slide and shape by shape.
Use Tesseract OCR to extract text from picture shapes.
The executable path of Tesseract will be set in configuration.
</instructions for step 1>

<instructions for step 2>
First, add a check to see if the content extracted is empty or not. If it's empty say so in an error message. If there is content proceed with the lesson text generation.
Then create a strong prompt to generate the best lesson text from the extracted content, and format it well.
The prompt must explain that the content is coming from text extraction from a .pptx file, slide by slide and shape by shape, and so that a thorough effort should be made to best understand and put together the content of each slide.
The prompt must include a section for optional additional_lesson_instructions, that will be set in inputs by the user.
The prompt must include a format instructions section, saying that because the text is for audio conversion, it should be flowing, there shouldn't be any special formatting, it should be one long flowing paragraph.
The prompt must say to introduce each new (sub)topic, so that the listener can follow along well.
The lesson text should be clear and engaging.
</instructions for step 2>

<intermediary step instructions>
After the text generation, add an intermediary step to remove special characters if any were still present in the generated text. It should remove special characters like ##, **, \n and so on.
</intermediary step instructions>

<instructions for step 3>
This audio generation step is optional. It will be set as 'yes' or 'no' in configuration.
Add a step to validate this configuration, it must be either 'yes' or 'no'.
Use the cleaned generated lesson text and the eleven labs tool to convert the file to audio.
Retrieve the name of the original .pptx file, and use this name to rename the audio file.
</instructions for step 3>

<error instructions>
Handle errors gracefully.
Each step should have its own informative error messaging.
</error instructions>

<output instructions>
The output must contain the extracted content, the generated text, the optional audio file, error logs and a status.
</output instructions>

Alternatively, you can build the tool progressively, step-by-step, by first building the content extraction part, then adding the lesson text generation, and finally the audio generation. Each step can be done with AI assistance and/or manually.

Below, you can study a step-by-step implementation of the tool.

Part 1 : Extracting the text content from a .pptx file

1.0 Using Shinkai AI assisted tool creation UI

You can try to build the content extraction feature first using the AI assistance, and then later on add the other features.

To do so, do not select any tool, as this feature does not rely on any, and use a good prompt. Because the prompt is short, covering just one feature, you can make it very thorough and add details on how to build the tool without risking overwhelming the LLM. Here is an example prompt to create a tool that extracts the text content from a .pptx file :

<task>
Create a tool that extracts the text content of a .pptx file presentation.
</task>

<input instructions>
The .pptx file will be given as a path in input.
Use string format.
</input instructions>

<configuration instructions>
1 configuration : the path to the Tesseract executable.
Use string.
</configuration instructions>

<instructions>
Extract the text content of the .pptx file slide by slide and shape by shape.
Make sure to include text extraction from all possible shapes with text, including from tables and graphs.
Use Tesseract OCR to extract text from picture shapes.
The executable path of Tesseract will be set in configuration.
If no content was obtained for a shape, remove it from the output. Do not show shapes with empty content.
</instructions>

<error instructions>
Handle errors gracefully.
Each step should have its own informative error messaging.
</error instructions>

<output instructions>
The output must contain the extracted content, error logs and a status.
</output instructions>

At the very least, such a prompt should create a good code flow for the intended feature, which you can then debug, edit, and improve.

Similar prompts were actually used to build the full code above, with step-by-step improvements through prompting and a little manual coding.

Below you’ll find a full description of how to code the content extraction feature.

1.1 Defining the text extraction process

Import what will be needed :

# /// script
# dependencies = [
#   "requests",
#   "python-pptx",
#   "pytesseract",
#   "pillow",
# ]
# ///

from typing import Optional, List
import os
import requests
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE
import pytesseract
from PIL import Image
import io
from shinkai_local_support import get_home_path

Define the configuration for the Tesseract executable path and the input for .pptx file path :

class CONFIG:
    tesseract_path: str

class INPUTS:
    file_path: str

Create the output class for the content, error messages and status :

class OUTPUT:
    content_of_the_presentation: Optional[List[str]]
    status: str
    message: Optional[str]

Define 3 functions to extract text from text blocks, tables and charts :

def extract_text_from_text_frame(text_frame) -> str:
    return ' '.join([run.text for paragraph in text_frame.paragraphs for run in paragraph.runs if run.text])  # Extracts and joins all text runs from paragraphs in a text frame

def extract_text_from_table(table) -> str:
    cell_texts = []  # List to store text from each cell
    for row in table.rows:
        for cell in row.cells:
            cell_texts.append(extract_text_from_text_frame(cell.text_frame))  # Extract text from each cell's text frame
    return ' | '.join(cell_texts)  # Join all cell texts with pipe separators

def extract_text_from_chart(chart) -> str:
    title = extract_text_from_text_frame(chart.chart_title.text_frame) if chart.has_title else ""  # Get chart title if it exists
    x_axis = extract_text_from_text_frame(chart.category_axis.axis_title.text_frame) if chart.category_axis.has_title else ""  # Get x-axis title if it exists
    y_axis = extract_text_from_text_frame(chart.value_axis.axis_title.text_frame) if chart.value_axis.has_title else ""  # Get y-axis title if it exists
    return f"Title: {title}, X-Axis: {x_axis}, Y-Axis: {y_axis}"  # Format chart data as a string

Create a function to extract the content from shapes using the functions defined above, plus use Tesseract OCR for picture shapes :

def extract_text_from_shape(shape) -> List[str]:
    results = []  # Initialize an empty list to store extracted text
    
    # Check if the shape contains a text frame
    if shape.has_text_frame:
        text = extract_text_from_text_frame(shape.text_frame)  # Extract text from the text frame
        if text.strip():  # If text is not empty after stripping whitespace
            results.append(f"Text Shape: {text}")  # Add it to results with a prefix
    
    # Check if the shape contains a table
    elif shape.has_table:
        table_text = extract_text_from_table(shape.table)  # Extract text from the table
        if table_text.strip():  # If table text is not empty
            results.append(f"Table: {table_text}")  # Add it to results with a prefix
    
    # Check if the shape contains a chart
    elif shape.has_chart:
        chart_text = extract_text_from_chart(shape.chart)  # Extract text from the chart
        # Check if chart text has content beyond empty labels
        if chart_text.replace("Title: , X-Axis: , Y-Axis: ", "").strip():
            results.append(f"Chart: {chart_text}")  # Add it to results with a prefix
    
    # Check if the shape is a picture
    elif shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
        try:
            image_data = shape.image.blob  # Get the image data as binary
            image = Image.open(io.BytesIO(image_data))  # Open image from binary data
            text = pytesseract.image_to_string(image)  # Use OCR to extract text from image
            if text.strip():  # If OCR found text
                results.append(f"Image Text: {text.strip()}")  # Add it to results with a prefix
        except Exception as e:
            pass  # Silently ignore any errors in image processing
    
    # Check if the shape is a group of shapes
    elif shape.shape_type == MSO_SHAPE_TYPE.GROUP:
        for sub_shape in shape.shapes:  # Iterate through all shapes in the group
            sub_texts = extract_text_from_shape(sub_shape)  # Recursively process each sub-shape
            if sub_texts:  # If any text was found
                results.extend(sub_texts)  # Add all results to the main list
    
    return results  # Return the list of all extracted text items

Define a function to read the presentation from either a URL or a local file path, and copy it to a working location. This step also checks that the file is readable :

async def _read_presentation(file_path: str, working_file_path: str) -> OUTPUT:
    try:
        # If the file is from a URL, download it
        if file_path.startswith('http://') or file_path.startswith('https://'):
            response = requests.get(file_path)  # Send HTTP request to get the file
            with open(working_file_path, 'wb') as f:  # Open destination file in binary write mode
                f.write(response.content)  # Write the downloaded content to the file
        # If it's a local file path, copy it to the working location
        else:
            with open(file_path, 'rb') as f:  # Open source file in binary read mode
                with open(working_file_path, 'wb') as w:  # Open destination file in binary write mode
                    w.write(f.read())  # Read the entire source file and write to destination
        
        # Create a success response
        o = OUTPUT()  # Initialize result object
        o.status = "success"  # Set status to indicate successful operation
        return o  # Return the result
    
    except Exception as e:
        # Handle any errors that occur during file operations
        o = OUTPUT()  # Initialize result object
        o.status = "error"  # Set status to indicate error
        o.message = f"Failed to read presentation: {str(e)}"  # Include error details in message
        return o  # Return the error result
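
Please note that the download branch saves whatever the server returns, even an error page. If you want it to fail fast instead, requests supports a timeout and a status check; a possible variant of the two download lines :

            response = requests.get(file_path, timeout=60)  # avoid hanging indefinitely on a dead link
            response.raise_for_status()  # raise on 4xx/5xx so the except branch reports the error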

1.2 Retrieving the content shape by shape and slide by slide

Create a function that applies the content extraction slide by slide and shape by shape :

async def _retrieve_presentation(working_file_path: str) -> OUTPUT:
    # Check if the file exists
    if not os.path.exists(working_file_path):
        o = OUTPUT()
        o.status = "error"
        o.message = "No presentation file loaded."
        return o
    
    try:
        # Load the presentation file
        prs = Presentation(working_file_path)
        text_content = []
        
        # Iterate through each slide
        for i, slide in enumerate(prs.slides, start=1):
            slide_content = []
            
            # Extract text from all shapes in the slide
            for shape in slide.shapes:
                shape_texts = extract_text_from_shape(shape)
                if shape_texts:
                    slide_content.extend(shape_texts)
            
            # Add slide number and content if there's any text
            if slide_content:
                text_content.append(f"Slide {i}")
                text_content.extend(slide_content)
        
        # Create successful output object with extracted content
        o = OUTPUT()
        o.content_of_the_presentation = text_content
        o.status = "success"
        return o
        
    except Exception as e:
        # Handle any errors during processing
        o = OUTPUT()
        o.status = "error"
        o.message = f"Failed to process presentation: {str(e)}"
        return o

1.3 Validating the configuration

Implement a validation function to stop the tool and log errors if there is an issue with the Tesseract OCR configuration :

def validate_config(config: CONFIG) -> List[str]:
    """ Validate configuration dependencies.
    Returns a list of error messages if any issues are found.
    """
    errors = []  # Initialize empty list to collect error messages
    
    # Validate tesseract_path
    if not config.tesseract_path:
        # Check if path is empty or None
        errors.append("Tesseract path is not provided.")
    elif not os.path.isfile(config.tesseract_path):
        # Check if path points to an actual file
        errors.append(f"Tesseract path does not point to a file: {config.tesseract_path}")
    elif not os.access(config.tesseract_path, os.X_OK):
        # Check if file has executable permissions
        errors.append(f"Tesseract executable is not executable: {config.tesseract_path}")
    
    return errors  # Return collected error messages
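
At this stage the function only validates the Tesseract path (the audio option check is added in Part 3). As a quick, hypothetical illustration of its behaviour :

config = CONFIG()
config.tesseract_path = "/not/a/real/path"  # hypothetical invalid value
print(validate_config(config))
# prints: ['Tesseract path does not point to a file: /not/a/real/path']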

1.4 Run function to execute all the processes

Define a run function using all the functions defined above. At the end, add a step to check whether the extracted content is empty. This check is useful because the tool will later use the extracted content to generate a lesson text : if there is no content, the tool stops and informs the user, saving compute and time.

async def run(config: CONFIG, inputs: INPUTS) -> OUTPUT:
    # Perform configuration validation
    errors = validate_config(config)  # Validates the provided configuration
    if errors:
        o = OUTPUT()
        o.status = "error"  # Set error status
        o.message = "Configuration errors: " + "; ".join(errors)  # Combines all errors into a single message
        return o
    
    # Set Tesseract path after validation
    pytesseract.pytesseract.tesseract_cmd = config.tesseract_path  # Configure OCR tool path from config
    
    home_path = await get_home_path()  # Asynchronously get the home directory
    working_file_path = os.path.join(home_path, 'working.pptx')  # Create path for temporary working file
    
    if not inputs.file_path:
        o = OUTPUT()
        o.status = "error"  # Set error status
        o.message = "No file path provided"  # Error if no input file specified
        return o
    
    read_result = await _read_presentation(inputs.file_path, working_file_path)  # Read and copy presentation to working file
    if read_result.status == "error":
        return read_result  # Early return if reading fails
    
    process_result = await _retrieve_presentation(working_file_path)  # Extract content from the working presentation
    if process_result.status == "error":
        return process_result  # Early return if processing fails
    
    slides_content = process_result.content_of_the_presentation or []  # Get slide content or empty list if None
    
    if not slides_content:
        o = OUTPUT()
        o.status = "warning"  # Set warning status
        o.message = "No text content found in the presentation"  # Warning if presentation has no text
        o.content_of_the_presentation = []
        return o
    
    output = OUTPUT()
    output.content_of_the_presentation = slides_content  # Store extracted content
    output.status = "success"  # Set success status
    return output

Part 2 : Adding an LLM prompt processor to generate a lesson text

2.0 Using Shinkai AI assisted tool creation UI

Now you can use the AI assisted tool creation to add a step which generates the lesson text, using the slides content extracted in the first step.

To do so, activate the tool ‘shinkai_llm_prompt_processor’, and use a prompt similar to this one :

<task>
Add a step that uses the shinkai_llm_prompt_processor tool to generate the text for a lesson about the presentation, using the extracted slides content.
Adapt the code. Keep everything else the same.
</task>

<input instructions>
Add a string input for additional lesson instructions to generate the lesson text. It defaults to 'none' if empty.
</input instructions>

<prompt instructions>
Create a strong prompt to generate the best lesson text from the extracted content, and format it well.
The prompt must explain that the content is coming from text extraction from a .pptx file, slide by slide and shape by shape, and so that a thorough effort should be made to best understand and put together the content of each slide.
The prompt must include a section for optional additional_lesson_instructions, that will be set in inputs by the user.
The prompt must include a format instructions section, saying that because the text is for audio conversion, it should be flowing, there shouldn't be any special formatting, it should be one long flowing paragraph.
The prompt must say to introduce each new (sub)topic, so that the listener can follow along well.
The lesson text should be clear and engaging.
</prompt instructions>

<additional step instructions>
After the lesson text generation, add a step to remove special characters if any are still present in the generated text. It should remove special characters like ##, **, \n and so on.
</additional step instructions>

<error instructions>
Handle errors gracefully.
Each step should have its own informative error messaging.
</error instructions>

<output instructions>
The output must contain the extracted content, the generated text, error logs and a status.
</output instructions>

At the very least, such a prompt should add a good code flow for the intended additional feature, which you can then debug, edit, and improve.

Similar prompts were actually used to add this next feature, with step-by-step improvements through prompting and a little manual coding.

Below you’ll find a full description of the code to add a lesson text generation feature.

2.1 Setting up the lesson text generation feature

Import the ‘shinkai_llm_prompt_processor’ tool. Also add ‘re’ to the imports; it will be used to clean the generated text.

import re
from shinkai_local_tools import shinkai_llm_prompt_processor

Add an input for additional instructions to generate the lesson text, so that the user can customize it. Set default to ‘none’.

class INPUTS:
    file_path: str
    additional_lesson_generation_instructions: Optional[str] = "none"

Add an output for the generated lesson :

class OUTPUT:
    content_of_the_presentation: Optional[List[str]]
    status: str
    message: Optional[str]
    lesson_about_the_presentation: Optional[str]

2.2 Using an elaborate prompt to generate optimal lesson text

In the run function, add a step to define a detailed prompt for the text generation. Give some context describing the type of content the LLM will use and its specificities. Include formatting instructions. Include the optional additional instructions coming from the user. Organise it well and use tags to make things clear for the LLM :

    # Step 2: Formulate the prompt for the LLM
    prompt = (
        "<personality>"
        "You are a domain expert teacher extremely good at creating university level educational material."
        "</personality>"
        "<task>"
        "Create the text for an audio lesson of the following content extracted from a powerpoint."
        "</task>"
        "<context>"
        "The content comes from text extraction and OCR from a powerpoint, and is provided shape by shape and slide by slide."
        "Because of this the content form each slide might be a bit disorganised or sparse. You need to take your time step-by-step to reconnect the different pieces of content and understand how they fit together."
        "This content might also contain special characters coming from the text extraction formatting. Make sure to handle these special characters gracefully, they shouldn't appear in the text for an audio lesson."
        "</context>"
        "<lesson instructions>"
        "Do your best to understand in detailed the content of each slide, taking into consideration that it might be a bit disorganised due to the text extraction process."
        "The lesson should be very detailed and precise, and go over absolutely all the content in an organised manner. Be sure to use the entirety of the content for the lesson."
        "The lesson should be clear and engaging."
        "Make sure to introduce it when you move on to a new (sub)topic, so that the listener can easily follow along."
        "</lesson instructions>"
        "<additional_instructions>"
        f"{inputs.additional_lesson_generation_instructions}"
        "</additional_instructions>"
        "<format instructions>"
        "Ensure the output is in one continuous paragraph without any special formatting or extra characters. Do not use and \ n formatting, just make it one long endless paragraph. It's should be flowing text. You can still include sections in it but present them orally, not with special formatting."
        "The text must flow. Do not use any formatting, but just plain text."
        "</format instructions>"
        "<content>"
        "Here is the content to use for the lesson: "
        f"{' '.join(slides_content)}"
        "</content>"
    )

2.3 Calling the LLM prompt processor tool

Just under, add a step to call the LLM prompt processor tool, using the prompt defined above.

    # Step 3: Use the LLM to generate the audio lesson text
    llm_response = await shinkai_llm_prompt_processor({"prompt": prompt, "format": "text"})
    generated_text = llm_response['message']
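
The call above assumes the prompt processor succeeds. If you want this step to have its own informative error message, like the other steps, a minimal variant following the same OUTPUT error pattern could be :

    # Step 3 (variant with its own error handling)
    try:
        llm_response = await shinkai_llm_prompt_processor({"prompt": prompt, "format": "text"})
        generated_text = llm_response['message']
    except Exception as e:
        o = OUTPUT()
        o.status = "error"
        o.message = f"Failed to generate lesson text: {str(e)}"
        return o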

2.4 Cleaning the text

Along with the previously defined functions at the top of the code, add a function to clean the generated lesson text from special characters, in case the LLM includes some despite our prompt format instructions :

def clean_special_characters(text: str) -> str:
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'#+\s*', ' ', text)
    text = re.sub(r'\*+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
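
For example, here is how it flattens a hypothetical markdown-flavoured string :

clean_special_characters("## Intro\n**Key point**: corals")
# returns: 'Intro Key point : corals'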

Add a step in the run function to use it :

    # Step 4: Clean any special characters from the generated text
    cleaned_text = clean_special_characters(generated_text)

Edit the outputs of the run function to also include the cleaned generated text.

    output = OUTPUT()
    output.content_of_the_presentation = slides_content
    output.lesson_about_the_presentation = cleaned_text
    output.status = "success"
    return output

Part 3 : Adding an optional text to speech feature to create an audio file of the lesson

3.0 Using Shinkai AI assisted tool creation UI

Now you can use the AI assisted tool creation to add a final, optional step which generates an audio file of the cleaned lesson text produced by the second feature of the tool.

To do so, activate the tool ‘eleven_labs_text_to_speech’, and use a prompt similar to this one :

<task>
Add an optional step that uses the eleven_labs_text_to_speech tool to create an audio file by converting the generated lesson text to audio.
Adapt the code. Keep everything else the same.
</task>

<configuration instructions>
Add a string configuration for optional generation of the audio, it's either 'yes' or 'no', it defaults to 'no'.
</configuration instructions>

<instructions>
This audio generation step is optional. It will be set as 'yes' or 'no' in configuration.
Add a step to validate this configuration, it must check it's either 'yes' or 'no'.
Use the cleaned generated lesson text and the eleven labs tool to convert the file to audio.
Retrieve the name of the original .pptx file, and use this name to rename the audio file.
</instructions>

<error instructions>
Handle errors gracefully.
Each step should have its own informative error messaging.
</error instructions>

<output instructions>
The output must contain the extracted content, the generated text, the optional audio file, error logs and a status.
</output instructions>

At the very least, such a prompt should add a good code flow for the intended last feature, which you can then debug, edit, and improve.

Similar prompts were actually used to add this next feature, with step-by-step improvements through prompting and a little manual coding.

Below you’ll find a full description of the code to add the audio file generation.

3.1 Setting up the audio file generation feature

Near the start of the code, add the ‘eleven_labs_text_to_speech’ tool to the imports, along with ‘shutil’ (used for file operations).

import shutil
from shinkai_local_tools import shinkai_llm_prompt_processor, eleven_labs_text_to_speech

Edit the config and output classes to include the audio generation option and the optional audio file path.

class CONFIG:
    tesseract_path: str
    generate_audio: str  # "yes" or "no"

class OUTPUT:
    content_of_the_presentation: Optional[List[str]]
    status: str
    message: Optional[str]
    lesson_about_the_presentation: Optional[str]
    audio_file: Optional[str]

Define a function to get the name of the .pptx file. It will be used to save the audio file with the same name.

def get_base_filename(file_path: str) -> str:
    """Extract the base filename without extension from a path"""
    base_name = os.path.basename(file_path)
    if '?' in base_name:
        base_name = base_name.split('?')[0]
    return os.path.splitext(base_name)[0]
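
The '?' handling strips URL query strings, so both local paths and download links yield a clean base name. A hypothetical example :

get_base_filename("https://example.com/slides/deck.pptx?token=abc")
# returns: 'deck'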

Add a step to the validate_config function to also check the configuration of the optional audio generation.

    # Validate generate_audio
    if config.generate_audio.lower() not in ["yes", "no"]:
        errors.append(f"Invalid config value for generate_audio: {config.generate_audio}. Must be 'yes' or 'no'.")

3.2 Calling the text to speech tool

Add a step to the run function to use the ‘eleven_labs_text_to_speech’ tool. This step is optional according to the configuration. Add a step to rename the audio file generated by the text-to-speech tool : make it more user-friendly by simply reusing the name of the original .pptx file. Also include an error message in case the audio file generation fails.

    # Step 5: Convert the lesson text to audio if configured to do so
    audio_file_path = None
    if config.generate_audio.lower() == "yes":
        try:
            # Generate the audio with the TTS function
            tts_response = await eleven_labs_text_to_speech({"text": cleaned_text})
            generated_audio_path = tts_response['audio_file']
            
            # Get the base filename from the input file path and create the desired output path
            base_filename = get_base_filename(inputs.file_path)
            desired_audio_path = os.path.join(home_path, f"{base_filename}.mp3")
            
            # Rename the generated audio file to match the presentation name
            if os.path.exists(generated_audio_path):
                # Check if the destination file exists and remove it if it does
                if os.path.exists(desired_audio_path):
                    os.remove(desired_audio_path)
                # Copy the generated file to the desired location with the new name
                shutil.copy2(generated_audio_path, desired_audio_path)
                # Remove the original generated file
                os.remove(generated_audio_path)
                audio_file_path = desired_audio_path
            else:
                audio_file_path = generated_audio_path  # Fallback to the original path if something went wrong
        except Exception as e:
            o = OUTPUT()
            o.status = "error"
            o.message = f"Failed to generate or rename audio: {str(e)}"
            return o
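
Please note that the copy-then-remove sequence above also works across filesystems. If the generated file and the destination happen to be on the same filesystem, a shorter equivalent is the standard library's os.replace, which renames atomically and overwrites an existing destination, so the existence check and removals could collapse into one call :

                # Possible shorthand, assuming both paths are on the same filesystem
                os.replace(generated_audio_path, desired_audio_path)
                audio_file_path = desired_audio_path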

Edit the outputs of the run function to also include the generated audio file.

    output = OUTPUT()
    output.content_of_the_presentation = slides_content
    output.lesson_about_the_presentation = cleaned_text
    output.audio_file = audio_file_path
    output.status = "success"
    return output

Please note that you could also modify the tool to use another text-to-speech provider, even a local one.

Part 4 : Improving the metadata of the tool

Shinkai automates the tool metadata generation, but you can improve it.

Good tool metadata should include :

  • an explicit tool title
  • a thorough description (features, options, requirements, extra information)
  • explicit descriptions for configurations and inputs
  • relevant keywords to trigger the tool

Go to the metadata section, and improve the above. Below is an example of good metadata for the tool.

Title :

PPTX Content Extractor With OCR And Audio Lesson Generator

Description :

This tool extracts text from a PowerPoint presentation, creates a lesson text about it, and generates an audio file of the lesson.
It extracts content slide by slide and shape by shape.
It uses OCR to extract text from image shapes (using Tesseract).
It generates a detailed lesson text about the .pptx file using an LLM and optional additional instructions.
Optionally, it generates an mp3 audio file of the lesson using ElevenLabs text-to-speech.
This tool requires Tesseract to be installed for OCR. It also requires the tool “ElevenLabs Text to Speech” from the Shinkai AI Store, and an API key for ElevenLabs text-to-speech if choosing to generate audio files.
You can download Tesseract from here : https://github.com/UB-Mannheim/tesseract/wiki. You can get an ElevenLabs API key here : https://elevenlabs.io/developers.

Metadata JSON :

{
  "homePage": "",
  "configurations": {
    "type": "object",
    "properties": {
      "generate_audio": {
        "description": "'yes' or 'no' flag to optionally generate an audio file of the lesson.",
        "type": "string"
      },
      "tesseract_path": {
        "description": "Path to the Tesseract executable (e.g. C:\\Program Files\\Tesseract-OCR\\tesseract.exe).",
        "type": "string"
      }
    },
    "required": [
      "generate_audio",
      "tesseract_path"
    ]
  },
  "parameters": {
    "type": "object",
    "properties": {
      "additional_lesson_generation_instructions": {
        "description": "Optional additional instructions for lesson text generation",
        "type": "string"
      },
      "file_path": {
        "description": "Path or URL of the PowerPoint file (e.g. C:\\Users\\gille\\Desktop\\reef_photogrammetry_presentation.pptx).",
        "type": "string"
      }
    },
    "required": [
      "file_path"
    ]
  },
  "result": {
    "type": "object",
    "properties": {
      "audio_file": {
        "description": "Path to the generated audio file",
        "nullable": true,
        "type": "string"
      },
      "content_of_the_presentation": {
        "description": "Text content extracted from the presentation",
        "items": {
          "type": "string"
        },
        "type": "array"
      },
      "lesson_about_the_presentation": {
        "description": "Text of the generated lesson",
        "nullable": true,
        "type": "string"
      },
      "message": {
        "description": "Error message or additional info",
        "nullable": true,
        "type": "string"
      },
      "status": {
        "description": "The status of the operation",
        "type": "string"
      }
    },
    "required": [
      "status"
    ]
  },
  "sqlTables": [],
  "sqlQueries": [],
  "oauth": [],
  "runner": "any",
  "operating_system": [
    "linux",
    "macos",
    "windows"
  ],
  "tool_set": "",
  "keywords": [
    "pptx",
    "powerpoint",
    "lesson",
    "presentation",
    "audio"
  ],
  "version": "1.0.0"
}

Now the tool should be complete. Save it.

Below you’ll find usage examples.

Part 5 : Using the tool ‘PPTX Content Extractor With OCR And Audio Lesson Generator’

5.1 Installing extra components and setting up configurations

Install Tesseract for OCR, and set its executable path in the configuration of the ‘PPTX Content Extractor With OCR And Audio Lesson Generator’ tool.

Install the ‘eleven_labs_text_to_speech’ tool from the Shinkai AI Store. Get an ElevenLabs API key with some credits. Go to the configuration tab of the ElevenLabs Shinkai tool, set your API key, and pick a voice.

Set audio generation to ‘yes’ or ‘no’ in the configuration of the ‘PPTX Content Extractor With OCR And Audio Lesson Generator’.

5.2 Usage examples

To generate an audio lesson from a .pptx file, set the audio generation to ‘yes’ in configuration, and simply mention the filename in your prompt.

To simply interact through prompts with the content of the .pptx file, set the audio generation to ‘no’, include the file path in your prompt, and add instructions.

Because the content is extracted slide by slide, you can also ask about specific slides.
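
For example, reusing the example path from the metadata in Part 4, hypothetical prompts could look like :

Generate an audio lesson from C:\Users\gille\Desktop\reef_photogrammetry_presentation.pptx.

Using C:\Users\gille\Desktop\reef_photogrammetry_presentation.pptx, summarize what slide 3 covers.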