1. Go to this page and download the library: Download serafim/tf-idf library. Choose the "require" download type.
2. Extract the ZIP file and open the index.php.
3. Add this code to the index.php.
<?php
require_once('vendor/autoload.php');
/* Start to develop here. Best regards https://php-download.com/ */
serafim / tf-idf example snippets
// Create the vectorizer and register every file that belongs to the corpus.
$vectorizer = new \Serafim\TFIDF\Vectorizer();

foreach (['/path/to/file-1.txt', '/path/to/file-2.txt'] as $relativePath) {
    $vectorizer->addFile(__DIR__ . $relativePath);
}

// The result of compute() is iterated as "document => entries" pairs:
// dump each document first, then every entry computed for it.
foreach ($vectorizer->compute() as $document => $entries) {
    var_dump($document);

    foreach ($entries as $entry) {
        var_dump($entry);
    }
}
use Serafim\TFIDF\Vectorizer;
use Serafim\TFIDF\Memory\FactoryInterface;
use Serafim\TFIDF\Memory\MemoryInterface;
$vectorizer = new Vectorizer(
    memory: new class implements FactoryInterface {
        /**
         * Creates a memory area for term counters.
         *
         * This reference implementation keeps the counters in a plain PHP
         * array; swap it for any other storage as long as the methods below
         * keep the same contract.
         */
        public function create(): MemoryInterface
        {
            return new class implements MemoryInterface, \IteratorAggregate {
                /**
                 * Counter values indexed by term.
                 *
                 * @var array<array-key, int>
                 */
                private array $counters = [];

                /**
                 * Increments the counter for the given term.
                 */
                public function inc(string $term): void
                {
                    $this->counters[$term] = ($this->counters[$term] ?? 0) + 1;
                }

                /**
                 * Returns the counter value for the given term, or 0 if the
                 * counter is not found.
                 */
                public function get(string $term): int
                {
                    return $this->counters[$term] ?? 0;
                }

                /**
                 * Returns TRUE if there is a counter for the specified term.
                 */
                public function has(string $term): bool
                {
                    return isset($this->counters[$term]);
                }

                /**
                 * Returns the number of registered counters.
                 */
                public function count(): int
                {
                    return \count($this->counters);
                }

                /**
                 * Returns a list of terms and counter values in
                 * format: [ WORD => 42 ]
                 *
                 * @return \Traversable<string, int>
                 */
                public function getIterator(): \Traversable
                {
                    foreach ($this->counters as $term => $value) {
                        // PHP converts numeric-string array keys to integers,
                        // so cast back to keep every yielded term a string.
                        yield (string) $term => $value;
                    }
                }

                /**
                 * Destruction of the allocated memory area.
                 */
                public function __destruct()
                {
                    $this->counters = [];
                }
            };
        }
    }
);
use Serafim\TFIDF\Vectorizer;
use Serafim\TFIDF\StopWords\FactoryInterface;
use Serafim\TFIDF\StopWords\StopWordsInterface;
$vectorizer = new Vectorizer(
    stopWords: new class implements FactoryInterface {
        /**
         * Builds the stop-word matcher for a document.
         *
         * The "$locale" argument carries the locale of the document, so a
         * different set of stop word drivers may be returned per locale.
         * This sample returns the same matcher for every locale.
         */
        public function create(string $locale): StopWordsInterface
        {
            return new class implements StopWordsInterface {
                /**
                 * Returns TRUE when the term must be ignored
                 * (prepositions, for example).
                 */
                public function match(string $term): bool
                {
                    $ignoredTerms = ['and', 'or', /* ... */];

                    return \in_array($term, $ignoredTerms, true);
                }
            };
        }
    }
);
use Serafim\TFIDF\Vectorizer;
use Serafim\TFIDF\Locale\IntlRepository;
$vectorizer = new Vectorizer(
    locales: new class extends IntlRepository {
        /**
         * Overrides the default locale reported by the repository.
         */
        public function getDefault(): string
        {
            return 'en_US';
        }
    }
);
use Serafim\TFIDF\Vectorizer;
use Serafim\TFIDF\Tokenizer\TokenizerInterface;
use Serafim\TFIDF\Document\StreamingDocumentInterface;
use Serafim\TFIDF\Document\TextDocumentInterface;
$vectorizer = new Vectorizer(
    tokenizer: new class implements TokenizerInterface {
        /**
         * Splits the document content into terms on runs of whitespace
         * and commas.
         *
         * Please note that there can be several types of document:
         *  - Text Document: One that contains text in string representation.
         *  - Streaming Document: One that can be read and may contain a
         *    large amount of data.
         *
         * @return iterable<string> Non-empty terms in document order.
         *
         * @throws \RuntimeException If the stream cannot be read or the
         *         content cannot be split (e.g. it is not valid UTF-8).
         */
        public function tokenize(StreamingDocumentInterface|TextDocumentInterface $document): iterable
        {
            $content = $document instanceof StreamingDocumentInterface
                ? \stream_get_contents($document->getContentStream())
                : $document->getContent();

            // stream_get_contents() reports a read failure by returning FALSE;
            // do not silently treat that as an empty document.
            if ($content === false) {
                throw new \RuntimeException('Unable to read the document content stream');
            }

            // Please note that the document also contains the locale, based on
            // which the term (word) separation logic can change.
            //
            // i.e. `if ($document->getLocale() === 'ar') { ... }`
            //
            // PREG_SPLIT_NO_EMPTY drops the empty strings produced by leading
            // or trailing separators so they are never counted as terms.
            $terms = \preg_split('/[\s,]+/u', $content, -1, \PREG_SPLIT_NO_EMPTY);

            // preg_split() returns FALSE on failure (for instance on malformed
            // UTF-8 with the "u" modifier), which is not a valid iterable.
            if ($terms === false) {
                throw new \RuntimeException('Unable to tokenize the document: ' . \preg_last_error_msg());
            }

            return $terms;
        }
    }
);
Loading, please wait ...
Before you can download the PHP files, the dependencies must be resolved. This can take a few minutes. Please be patient.