PHP code example of serafim / tf-idf

1. Go to the download page and download the serafim/tf-idf library. Choose the download type "require".

2. Extract the ZIP file and open index.php.

3. Add this code to index.php:
    
        
<?php
// Load Composer's autoloader. The path is anchored to this file's directory
// so the include resolves the same way regardless of the current working
// directory or the configured include_path.
require_once __DIR__ . '/vendor/autoload.php';

/* Start developing here. Best regards, https://php-download.com/ */

    

serafim / tf-idf example snippets


// Create the vectorizer that the documents are added to.
$vectorizer = new \Serafim\TFIDF\Vectorizer();

// Register both sample files; each path is resolved from this script's directory.
foreach (['/path/to/file-1.txt', '/path/to/file-2.txt'] as $relativePath) {
    $vectorizer->addFile(__DIR__ . $relativePath);
}

// compute() is iterated as "document => entries" pairs: dump the document
// first, then every entry that belongs to it.
foreach ($vectorizer->compute() as $document => $entries) {
    var_dump($document);

    foreach ($entries as $termEntry) {
        var_dump($termEntry);
    }
}

Serafim\TFIDF\Document\FileDocument {
    locale: "ru_RU"
    pathname: "/home/example/how-it-works.md"
}

Serafim\TFIDF\Entry {
    term: "работает"
    occurrences: 4
    df: 1
    tf: 0.012012012012012
    idf: 0.69314718055995
    tfidf: 0.0083260922589783
}

Serafim\TFIDF\Entry {
    term: "php"
    occurrences: 26
    df: 2
    tf: 0.078078078078078
    idf: 0.0
    tfidf: 0.0
}

Serafim\TFIDF\Entry {
    term: "запуска"
    occurrences: 2
    df: 1
    tf: 0.006006006006006
    idf: 0.69314718055995
    tfidf: 0.0041630461294892
}

// ...etc...

$vectorizer = new \Serafim\TFIDF\Vectorizer();

// The same sample file is handed over through each of the add*() methods.
$pathname = __DIR__ . '/path/to/file.txt';

// As a path string.
$vectorizer->addFile($pathname);
// As an SplFileInfo instance.
$vectorizer->addFile(new \SplFileInfo($pathname));
// As a plain string of text.
$vectorizer->addText('example text');
// As a stream resource opened in 'rb' mode.
$vectorizer->addStream(fopen($pathname, 'rb'));

// OR

// Pass a custom document implementation to add().
$customDocument = new class implements \Serafim\TFIDF\Document\TextDocumentInterface {
    public function getLocale(): string
    {
        /* ... */
    }

    public function getContent(): string
    {
        /* ... */
    }
};

$vectorizer->add($customDocument);

$vectorizer = new \Serafim\TFIDF\Vectorizer();

$pathname = __DIR__ . '/path/to/file.txt';

// The value returned by each create*() call is kept in its own variable.
$file = $vectorizer->createFile($pathname);
$text = $vectorizer->createText('example text');
$stream = $vectorizer->createStream(fopen($pathname, 'rb'));

// Each iteration of compute() provides:
//   key   ($document) = object(Serafim\TFIDF\Document\DocumentInterface)
//   value ($result)   = list<object(Serafim\TFIDF\Entry)>
foreach ($vectorizer->compute() as $document => $result) {
}

// computeFor() receives one created document and returns
// list<object(Serafim\TFIDF\Entry)>.
$text = $vectorizer->createText('example text');
$result = $vectorizer->computeFor($text);

use Serafim\TFIDF\Vectorizer;
use Serafim\TFIDF\Memory\FactoryInterface;
use Serafim\TFIDF\Memory\MemoryInterface;

// Custom memory driver: the factory is built first and then passed to the
// vectorizer through the "memory" named argument.
$memoryFactory = new class implements FactoryInterface {
    /**
     * Creates a memory area for counters.
     */
    public function create(): MemoryInterface
    {
        return new class implements MemoryInterface, \IteratorAggregate {
            /**
             * Increments the counter for the given term.
             */
            public function inc(string $term): void
            {
                /* ... */
            }

            /**
             * Returns the counter value for the given term, or 0 if the
             * counter is not found.
             */
            public function get(string $term): int
            {
                /* ... */
            }

            /**
             * Should return TRUE if there is a counter for the specified term.
             */
            public function has(string $term): bool
            {
                /* ... */
            }

            /**
             * Returns the number of registered counters.
             */
            public function count(): int
            {
                /* ... */
            }

            /**
             * Returns a list of terms and counter values in the
             * format: [ WORD => 42 ]
             */
            public function getIterator(): \Traversable
            {
                /* ... */
            }

            /**
             * Destroys the allocated memory area.
             */
            public function __destruct()
            {
                /* ... */
            }
        };
    }
};

$vectorizer = new Vectorizer(memory: $memoryFactory);

use Serafim\TFIDF\Vectorizer;
use Serafim\TFIDF\StopWords\FactoryInterface;
use Serafim\TFIDF\StopWords\StopWordsInterface;

// Custom stop-words driver: the factory is built first and then passed to the
// vectorizer through the "stopWords" named argument.
$stopWordsFactory = new class implements FactoryInterface {
    /**
     * Creates the stop-words matcher for a document locale.
     *
     * A different set of stop word drivers can be returned depending on
     * the locale ("$locale" argument) of the document.
     */
    public function create(string $locale): StopWordsInterface
    {
        return new class implements StopWordsInterface {
            /**
             * Returns TRUE if the word should be ignored
             * (for example, prepositions).
             */
            public function match(string $term): bool
            {
                $ignored = ['and', 'or', /* ... */];

                return \in_array($term, $ignored, true);
            }
        };
    }
};

$vectorizer = new Vectorizer(stopWords: $stopWordsFactory);

use Serafim\TFIDF\Vectorizer;
use Serafim\TFIDF\Locale\IntlRepository;

// Custom locale repository: built first and then passed to the vectorizer
// through the "locales" named argument.
$localeRepository = new class extends IntlRepository {
    /**
     * Specifies the default locale.
     */
    public function getDefault(): string
    {
        return 'en_US';
    }
};

$vectorizer = new Vectorizer(locales: $localeRepository);

use Serafim\TFIDF\Vectorizer;
use Serafim\TFIDF\Tokenizer\TokenizerInterface;
use Serafim\TFIDF\Document\StreamingDocumentInterface;
use Serafim\TFIDF\Document\TextDocumentInterface;

$vectorizer = new Vectorizer(
    tokenizer: new class implements TokenizerInterface {
        /**
         * Splits the document content into terms on runs of whitespace
         * and commas.
         *
         * Please note that there can be several types of document:
         *  - Text Document: One that contains text in string representation.
         *  - Streaming Document: One that can be read and may contain a
         *    large amount of data.
         *
         * @return iterable<int, string> Non-empty terms in order of appearance.
         *
         * @throws \RuntimeException If the content stream cannot be read or
         *                           the content cannot be split.
         */
        public function tokenize(StreamingDocumentInterface|TextDocumentInterface $document): iterable
        {
            $content = $document instanceof StreamingDocumentInterface
                ? \stream_get_contents($document->getContentStream())
                : $document->getContent();

            // stream_get_contents() returns FALSE on failure. Without this
            // check the value would be silently coerced to an empty string.
            if ($content === false) {
                throw new \RuntimeException('Unable to read the document content stream');
            }

            // Please note that the document also contains the locale, based on
            // which the term (word) separation logic can change.
            //
            // i.e. `if ($document->getLocale() === 'ar') { ... }`
            //

            // PREG_SPLIT_NO_EMPTY drops the empty strings that leading or
            // trailing separators (or empty content) would otherwise produce.
            $terms = \preg_split('/[\s,]+/isum', $content, -1, \PREG_SPLIT_NO_EMPTY);

            // preg_split() returns FALSE on error (e.g. malformed UTF-8 with
            // the "u" modifier), which is not a valid iterable.
            if ($terms === false) {
                throw new \RuntimeException(
                    'Unable to tokenize the document: ' . \preg_last_error_msg()
                );
            }

            return $terms;
        }
    }
);