PHP code example of benbjurstrom / markdown-object

1. Go to this page and download the library: Download benbjurstrom / markdown-object library. Choose the download type "require".

2. Extract the ZIP file and open index.php.

3. Add this code to index.php:
    
        
<?php
// Load Composer's autoloader. The path is anchored to this file's directory
// (__DIR__) so it does not depend on include_path or on the working
// directory the script happens to be started from.
require_once __DIR__ . '/vendor/autoload.php';

/* Start to develop here. Best regards https://php-download.com/ */

    

benbjurstrom / markdown-object example snippets


use League\CommonMark\Environment\Environment;
use League\CommonMark\Parser\MarkdownParser;
use League\CommonMark\Extension\CommonMark\CommonMarkCoreExtension;
use League\CommonMark\Extension\Table\TableExtension;
use BenBjurstrom\MarkdownObject\Build\MarkdownObjectBuilder;
use BenBjurstrom\MarkdownObject\Tokenizer\TikTokenizer;

// End-to-end example: parse a Markdown file, build the structured model,
// then print every chunk with its id, token count, start line and breadcrumb.

// 1) Parse Markdown with CommonMark
// The environment is given the core CommonMark extension plus the table
// extension before it is handed to the parser.
$env = new Environment();
$env->addExtension(new CommonMarkCoreExtension());
$env->addExtension(new TableExtension());

$parser   = new MarkdownParser($env);
$filename = 'guide.md';
$markdown = file_get_contents($filename);
if ($markdown === false) {
    // file_get_contents() returns false when the file cannot be read.
    // Stop here instead of handing a non-string on to the parser and builder.
    throw new \RuntimeException("Unable to read Markdown file: {$filename}");
}
$doc      = $parser->parse($markdown);

// 2) Build the structured model
// The builder receives the parsed document together with the file name, the
// raw Markdown source and the tokenizer.
$builder   = new MarkdownObjectBuilder();
$tokenizer = TikTokenizer::forModel('gpt-3.5-turbo');
$mdObj     = $builder->build($doc, $filename, $markdown, $tokenizer);

// 3) Emit hierarchically-packed chunks
// Named arguments (PHP 8.0+) make the two token limits explicit at the call site.
$chunks = $mdObj->toMarkdownChunks(target: 512, hardCap: 1024);

foreach ($chunks as $chunk) {
    echo "---\n";
    echo "Chunk: {$chunk->id} | {$chunk->tokenCount} tokens";

    // Source position tracking for finding chunks in original document
    // Line information may be absent (null), so the start line is printed
    // only when it is available.
    $pos = $chunk->sourcePosition;
    if ($pos->lines !== null) {
        echo " | Line: {$pos->lines->startLine}";
    }
    echo "\n";
    // The breadcrumb is an array of path segments; join them for display.
    echo implode(' › ', $chunk->breadcrumb) . "\n";
    echo "---\n\n";
    echo $chunk->markdown . "\n\n";
}

/*
---
Chunk: 1 | 163 tokens | Line: 1
demo.md › Getting Started
---

# Getting Started

Welcome to the Markdown Object demo! This tool helps you visualize how markdown is parsed and chunked.

## Features

### Real-time Processing

Type or paste markdown in the left pane and see the results instantly.

### Hierarchical Chunking

Content is automatically organized into semantic chunks that keep related information together…

---
Chunk: 2 | 287 tokens | Line: 18
demo.md › Getting Started › Advanced Options
---

## Advanced Options

Configure chunking parameters to see how different settings affect the output.

### Token Limits

Adjust the target and hard cap values to control chunk sizes…
*/

// Serialize to JSON
// The argument is PHP's built-in JSON_PRETTY_PRINT flag constant; presumably
// it is forwarded to json_encode() — confirm against the library.
$json = $mdObj->toJson(JSON_PRETTY_PRINT);

// Deserialize from JSON
// The class is referenced by its fully-qualified name, so this line needs no
// `use` import. It is fed the JSON string produced by the statement above.
$copy = \BenBjurstrom\MarkdownObject\Model\MarkdownObject::fromJson($json);

// NOTE(review): this import repeats the TikTokenizer import from the first
// snippet. The snippets appear to be independent examples; if they are pasted
// into a single file, keep only one copy, since PHP rejects a second `use`
// that claims an alias already in use.
use BenBjurstrom\MarkdownObject\Tokenizer\TikTokenizer;

// Use a different model
$tokenizer = TikTokenizer::forModel('gpt-4');

// Or use a specific encoding
// (Alternative to the line above: this assignment replaces the previous
// $tokenizer value, so only one of the two is needed in real code.)
$tokenizer = TikTokenizer::forEncoding('p50k_base');

// Pass to both build() and toMarkdownChunks()
// Relies on $builder, $doc, $filename and $markdown from the first snippet.
$mdObj = $builder->build($doc, $filename, $markdown, $tokenizer);
$chunks = $mdObj->toMarkdownChunks(
    target: 512,
    hardCap: 1024,
    tok: $tokenizer
);

// Chunking with every option spelled out as a named argument (PHP 8.0+).
// Relies on $mdObj from an earlier snippet.
// NOTE(review): $customTokenizer is not defined anywhere in these snippets;
// presumably it is a tokenizer instance the caller creates beforehand — confirm.
$chunks = $mdObj->toMarkdownChunks(
    target: 256,                // Smaller target for content splitting
    hardCap: 512,               // Smaller hard cap for hierarchy
    tok: $customTokenizer,      // Optional: use different tokenizer
    repeatTableHeaders: false   // Optional: don't repeat headers in split tables
);

// Build-time token count: the sum of the nodes' token counts, with no
// separators counted between them.
echo $mdObj->tokenCount;  // e.g., 155

// Chunk: