| Current Path : /home/users/unlimited/www/ultimate-ai.codeskitter.site/app/Services/Chatbot/ |
| Current File : /home/users/unlimited/www/ultimate-ai.codeskitter.site/app/Services/Chatbot/LinkCrawler.php |
<?php
namespace App\Services\Chatbot;
use App\Helpers\Classes\Helper;
use Exception;
/**
* Class LinkCrawler
*
* A simple web crawler for extracting content and links from a given website.
*
* @since 1.3
*/
class LinkCrawler
{
    /**
     * @var string the base URL of the website to crawl
     */
    private $baseUrl;

    /**
     * @var array list of same-host links discovered so far, in discovery order
     */
    private $links = [];

    /**
     * @var int the maximum number of links to record and crawl
     */
    private $maxLinks = 30;

    /**
     * @var array path fragments; any URL containing one of them is skipped
     */
    private $invalidPaths = ['/cdn-cgi/'];

    /**
     * @var array map of crawled URL => extracted text content
     */
    private $contents = [];

    /**
     * @var array map of de-duplication key => true for every URL already scheduled
     */
    private $visited = [];

    /**
     * LinkCrawler constructor.
     *
     * @param string $url the base URL of the website to crawl
     */
    public function __construct($url)
    {
        $this->baseUrl = $url;
    }

    /**
     * Initiate the crawling process.
     *
     * Results are read afterwards through getContents() and getLinks().
     *
     * @param bool $is_single when true only the base URL is fetched; otherwise
     *                        links found on each page are followed recursively
     */
    public function crawl($is_single = false)
    {
        if ($is_single) {
            $this->crawlSinglePage($this->baseUrl);

            return;
        }

        // Remember the starting page so a link pointing back to it is not fetched twice.
        $this->visited[$this->visitKey($this->baseUrl)] = true;
        $this->crawlPage($this->baseUrl);
    }

    /**
     * Recursively crawl a page and the same-host links found on it.
     *
     * Stops scheduling new links once $maxLinks links have been recorded.
     * Pages that cannot be fetched are skipped.
     *
     * @param string $url the URL of the page to crawl
     */
    private function crawlPage($url)
    {
        $html = $this->fetchHtml($url);
        if ($html === null) {
            return;
        }

        $this->contents[$url] = $this->stripTagsExceptContent($html);

        foreach ($this->extractLinks($html) as $link) {
            // Checked on every iteration (not only right after an insertion) so that
            // parent frames also stop once a nested call has exhausted the budget.
            if (count($this->links) >= $this->maxLinks) {
                return;
            }

            $absoluteLink = $this->makeAbsoluteUrl($link);
            if ($absoluteLink === null) {
                continue;
            }

            // A fragment addresses a position inside the same document; drop it.
            $absoluteLink = explode('#', $absoluteLink, 2)[0];
            $key = $this->visitKey($absoluteLink);

            if (
                isset($this->visited[$key])
                || ! $this->isSameDomain($absoluteLink, $this->baseUrl)
                || $this->hasInvalidPath($absoluteLink)
                || $this->isImage($absoluteLink)
            ) {
                continue;
            }

            $this->visited[$key] = true;
            $this->links[] = $absoluteLink;

            try {
                $this->crawlPage($absoluteLink);
            } catch (Exception $e) {
                // Best effort: a failing sub-page must not abort the whole crawl.
                continue;
            }
        }
    }

    /**
     * Crawl a single page without following any of its links.
     *
     * @param string $url the URL of the page to crawl
     */
    private function crawlSinglePage($url)
    {
        $html = $this->fetchHtml($url);
        if ($html === null) {
            return;
        }

        $this->contents[$url] = $this->stripTagsExceptContent($html);
    }

    /**
     * Download the raw body of a page.
     *
     * Only http and https URLs are fetched, so a local path or a stream wrapper
     * such as file:// or php:// is never read. This check does not by itself
     * restrict which hosts may be contacted.
     *
     * @param string $url the URL to download
     *
     * @return string|null the response body, or null when the URL is rejected
     *                     or file_get_contents() reports failure
     */
    private function fetchHtml($url)
    {
        $url = (string) $url;
        $scheme = parse_url($url, PHP_URL_SCHEME);
        if (! is_string($scheme) || ! in_array(strtolower($scheme), ['http', 'https'], true)) {
            return null;
        }

        // Warnings are intentionally not suppressed: if the runtime promotes them to
        // exceptions, nested failures are still handled by the catch in crawlPage().
        $html = file_get_contents($url);

        return $html === false ? null : $html;
    }

    /**
     * Collect the href values of all anchor tags in an HTML document.
     *
     * Accepts single- or double-quoted attributes in any letter case and
     * decodes HTML entities (e.g. "&amp;") inside the attribute value.
     *
     * @param string $html the HTML source
     *
     * @return array list of raw (possibly relative) link targets
     */
    private function extractLinks($html)
    {
        $found = preg_match_all('/<a\s+(?:[^>]*?\s+)?href\s*=\s*(["\'])(.*?)\1/is', $html, $matches);
        if ($found === false || $found === 0) {
            return [];
        }

        $targets = [];
        foreach ($matches[2] as $href) {
            $targets[] = trim(html_entity_decode($href, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
        }

        return $targets;
    }

    /**
     * Build the key used to decide whether a URL was already scheduled.
     *
     * The fragment and trailing slashes are ignored, so "/about", "/about/"
     * and "/about#team" count as the same page.
     *
     * @param string $url an absolute URL
     *
     * @return string the de-duplication key
     */
    private function visitKey($url)
    {
        $withoutFragment = explode('#', (string) $url, 2)[0];

        return rtrim($withoutFragment, '/');
    }

    /**
     * Make a relative URL absolute.
     *
     * Handles absolute http(s) URLs, protocol-relative URLs ("//host/path") and
     * root-relative URLs ("/path"). Anything else (document-relative paths,
     * "mailto:", "javascript:", bare fragments, ...) is rejected.
     *
     * @param string $url the URL found in the page
     *
     * @return string|null the absolute URL, or null if unable to make absolute
     */
    private function makeAbsoluteUrl($url)
    {
        $url = trim((string) $url);
        if ($url === '') {
            return null;
        }

        if (preg_match('#^https?://#i', $url) === 1) {
            return $url;
        }

        $scheme = parse_url($this->baseUrl, PHP_URL_SCHEME);
        $host = parse_url($this->baseUrl, PHP_URL_HOST);
        if (! is_string($scheme) || ! is_string($host) || $host === '') {
            // Without a usable base there is nothing to resolve against.
            return null;
        }

        if (strncmp($url, '//', 2) === 0) {
            return $scheme . ':' . $url;
        }

        if ($url[0] === '/') {
            // Keep a non-default port, otherwise the link would target another service.
            $port = parse_url($this->baseUrl, PHP_URL_PORT);
            $authority = is_int($port) ? $host . ':' . $port : $host;

            return $scheme . '://' . $authority . $url;
        }

        return null;
    }

    /**
     * Check if two URLs are of the same domain.
     *
     * Host names are compared case-insensitively. A URL without a host never matches.
     *
     * @param string $url1 first URL
     * @param string $url2 second URL
     *
     * @return bool true if URLs are of the same domain, false otherwise
     */
    private function isSameDomain($url1, $url2)
    {
        $host1 = parse_url($url1, PHP_URL_HOST);
        $host2 = parse_url($url2, PHP_URL_HOST);

        if (! is_string($host1) || ! is_string($host2)) {
            return false;
        }

        return strcasecmp($host1, $host2) === 0;
    }

    /**
     * Check if a URL contains any of the invalid paths.
     *
     * @param string $url the URL to check
     *
     * @return bool true if URL contains invalid paths, false otherwise
     */
    private function hasInvalidPath($url)
    {
        foreach ($this->invalidPaths as $invalidPath) {
            if (strpos($url, $invalidPath) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Check if a URL points to an image.
     *
     * Only the path component is inspected, so query strings and fragments
     * neither hide nor fake an image extension.
     *
     * @param string $url the URL to check
     *
     * @return bool true if URL points to an image, false otherwise
     */
    private function isImage($url)
    {
        $imageExtensions = ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'apng', 'avif', 'svg', 'webp', 'ico', 'tiff'];

        $path = parse_url($url, PHP_URL_PATH);
        if (! is_string($path)) {
            return false;
        }

        $extension = strtolower(pathinfo($path, PATHINFO_EXTENSION));

        return in_array($extension, $imageExtensions, true);
    }

    /**
     * Strip HTML tags from content, except for specified elements.
     *
     * Removes <header> and <footer> blocks and elements carrying the
     * "screen-reader-text" / "screen-reader-shortcut" classes, then hands the
     * remainder to Helper::strip_all_tags().
     *
     * @param string $html the HTML content to strip tags from
     *
     * @return string the stripped text content
     */
    private function stripTagsExceptContent($html)
    {
        $patterns = [
            '/<header\b[^>]*>.*?<\/header>/is',
            '/<footer\b[^>]*>.*?<\/footer>/is',
            '/<[^>]+class="[^"]*\bscreen-reader-text\b[^"]*"[^>]*>.*?<\/[^>]+>/is',
            '/<[^>]+class="[^"]*\bscreen-reader-shortcut\b[^"]*"[^>]*>.*?<\/[^>]+>/is',
        ];

        foreach ($patterns as $pattern) {
            $cleaned = preg_replace($pattern, '', $html);
            // preg_replace() returns null on a PCRE error; keep the previous
            // markup in that case instead of losing the whole page.
            if ($cleaned !== null) {
                $html = $cleaned;
            }
        }

        return Helper::strip_all_tags($html, true);
    }

    /**
     * Get the contents of crawled pages.
     *
     * @return array the contents of crawled pages, keyed by URL
     */
    public function getContents()
    {
        return $this->contents;
    }

    /**
     * Get the crawled links.
     *
     * @return array the crawled links
     */
    public function getLinks()
    {
        return $this->links;
    }
}