Refresh
This commit is contained in:
106
app/Analyzers/Analyzer.php
Executable file
106
app/Analyzers/Analyzer.php
Executable file
@@ -0,0 +1,106 @@
|
||||
<?php
|
||||
|
||||
namespace App\Analyzers;
|
||||
|
||||
use App\Models\Document;
|
||||
use Illuminate\Http\Client\Response;
|
||||
use Storage;
|
||||
|
||||
abstract class Analyzer
|
||||
{
|
||||
/**
|
||||
* Document being analyzed.
|
||||
*
|
||||
* @var \App\Models\Document
|
||||
*/
|
||||
protected $document;
|
||||
|
||||
/**
|
||||
* File content.
|
||||
*
|
||||
* @var string
|
||||
*/
|
||||
protected $body;
|
||||
|
||||
/**
|
||||
* Document details (meta data).
|
||||
*
|
||||
* @var mixed
|
||||
*/
|
||||
protected $details;
|
||||
|
||||
/**
|
||||
* Provides temporary access to response to analyzers.
|
||||
*
|
||||
* @var \Illuminate\Http\Client\Response
|
||||
*/
|
||||
protected $response;
|
||||
|
||||
/**
|
||||
* Associate Cyca's document being analized.
|
||||
*
|
||||
* @return self
|
||||
*/
|
||||
public function setDocument(Document $document)
|
||||
{
|
||||
$this->document = $document;
|
||||
|
||||
return $this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Document's body as fetched by Cyca.
|
||||
*
|
||||
* @param string $body
|
||||
*
|
||||
* @return self
|
||||
*/
|
||||
public function setBody($body)
|
||||
{
|
||||
$this->body = $body;
|
||||
|
||||
return $this;
|
||||
}
|
||||
|
||||
/**
|
||||
* HTTP response when fetching document.
|
||||
*
|
||||
* @return self
|
||||
*/
|
||||
public function setResponse(Response $response)
|
||||
{
|
||||
$this->response = $response;
|
||||
|
||||
return $this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Store details on disk.
|
||||
*/
|
||||
protected function storeDetailsOnDisk()
|
||||
{
|
||||
if (empty($this->details)) {
|
||||
return;
|
||||
}
|
||||
|
||||
$storageRoot = $this->document->getStoragePath();
|
||||
$metaFilename = sprintf('%s/meta.json', $storageRoot);
|
||||
|
||||
Storage::put($metaFilename, json_encode($this->details));
|
||||
}
|
||||
|
||||
/**
|
||||
* Store some details in database. This method uses an array to map document
|
||||
* properties to metadata properties.
|
||||
*
|
||||
* @param array $mappings
|
||||
*/
|
||||
protected function applyDetailsToDocument($mappings = [])
|
||||
{
|
||||
foreach ($mappings as $documentKey => $detailsKey) {
|
||||
if (!empty($this->details[$detailsKey])) {
|
||||
$this->document->{$documentKey} = $this->details[$detailsKey];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
34
app/Analyzers/ExifAnalyzer.php
Executable file
34
app/Analyzers/ExifAnalyzer.php
Executable file
@@ -0,0 +1,34 @@
|
||||
<?php
|
||||
|
||||
namespace App\Analyzers;
|
||||
|
||||
/**
|
||||
* Extract information from a supported image file.
|
||||
*/
|
||||
class ExifAnalyzer extends Analyzer
|
||||
{
|
||||
/**
|
||||
* Analyzes document.
|
||||
*/
|
||||
public function analyze()
|
||||
{
|
||||
if (empty($this->body)) {
|
||||
return;
|
||||
}
|
||||
|
||||
$bodyPath = storage_path('app/'.$this->document->getStoragePath().'/body');
|
||||
$this->details = exif_read_data($bodyPath, null, true, true);
|
||||
|
||||
if (empty($this->details)) {
|
||||
return;
|
||||
}
|
||||
|
||||
$this->document->description = (string) view('partials.details.image')->with([
|
||||
'exif' => $this->details,
|
||||
'url' => asset(str_replace('public', 'storage', $this->document->getStoragePath()).'/body'),
|
||||
]);
|
||||
|
||||
$this->storeDetailsOnDisk();
|
||||
$this->applyDetailsToDocument();
|
||||
}
|
||||
}
|
||||
473
app/Analyzers/HtmlAnalyzer.php
Executable file
473
app/Analyzers/HtmlAnalyzer.php
Executable file
@@ -0,0 +1,473 @@
|
||||
<?php
|
||||
|
||||
namespace App\Analyzers;
|
||||
|
||||
use App\Models\Feed;
|
||||
use DomDocument;
|
||||
use DOMElement;
|
||||
use DOMXPath;
|
||||
use Elphin\IcoFileLoader\IcoFileService;
|
||||
use Illuminate\Support\Facades\Http;
|
||||
use SimplePie;
|
||||
use Storage;
|
||||
use Str;
|
||||
|
||||
/**
|
||||
* Extract information from a HTML file.
|
||||
*/
|
||||
class HtmlAnalyzer extends Analyzer
|
||||
{
|
||||
/**
|
||||
* Provides temporary access to DOM document to analyzers.
|
||||
*
|
||||
* @var DOMDocument
|
||||
*/
|
||||
private $domDocument;
|
||||
|
||||
/**
|
||||
* Provides temporary access to <meta> tags to analyzers.
|
||||
*
|
||||
* @var array
|
||||
*/
|
||||
private $metaTags = [];
|
||||
|
||||
/**
|
||||
* Provides temporary access to <link> tags to analyzers.
|
||||
*
|
||||
* @var array
|
||||
*/
|
||||
private $linkTags = [];
|
||||
|
||||
/**
|
||||
* Analyzes document.
|
||||
*/
|
||||
public function analyze()
|
||||
{
|
||||
if (empty($this->body)) {
|
||||
return;
|
||||
}
|
||||
|
||||
$this->createDomDocument();
|
||||
$this->findTitle();
|
||||
$this->findMetaTags();
|
||||
$this->findLinkTags();
|
||||
$this->findBestFavicon();
|
||||
$this->discoverFeeds();
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a DOM document from document's body.
|
||||
*/
|
||||
protected function createDomDocument()
|
||||
{
|
||||
$this->body = mb_convert_encoding($this->body, 'HTML-ENTITIES', 'UTF-8');
|
||||
|
||||
libxml_use_internal_errors(true);
|
||||
|
||||
$this->domDocument = new DomDocument('1.0', 'UTF-8');
|
||||
|
||||
$this->domDocument->loadHtml($this->body);
|
||||
|
||||
libxml_clear_errors();
|
||||
}
|
||||
|
||||
/**
|
||||
* Find nodes corresponding to specified XPath query.
|
||||
*
|
||||
* @param string $xpathQuery
|
||||
*
|
||||
* @return DomNodeList
|
||||
*/
|
||||
protected function findNodes($xpathQuery)
|
||||
{
|
||||
$xpath = new DOMXPath($this->domDocument);
|
||||
|
||||
return $xpath->query($xpathQuery);
|
||||
}
|
||||
|
||||
/**
|
||||
* Find first node corresponding to specified XPath query.
|
||||
*
|
||||
* @param string $xpathQuery
|
||||
*
|
||||
* @return DomNode
|
||||
*/
|
||||
protected function findFirstNode($xpathQuery)
|
||||
{
|
||||
$xpath = new DOMXPath($this->domDocument);
|
||||
$nodes = $xpath->query($xpathQuery);
|
||||
|
||||
if ($nodes->length === 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return $nodes->item(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Discover feeds for this document, store them and link them.
|
||||
*/
|
||||
protected function discoverFeeds()
|
||||
{
|
||||
$toSync = $this->document->feeds()->get()->pluck('id')->all();
|
||||
|
||||
$alternateLinks = data_get($this->linkTags, 'Alternate', []);
|
||||
|
||||
// Hard guessing some paths (.rss, ./rss. ./.rss for instance)
|
||||
$potentialNames = ['feed', 'rss', 'atom'];
|
||||
|
||||
foreach ($potentialNames as $potentialName) {
|
||||
$alternateLinks[] = [
|
||||
'type' => 'application/xml',
|
||||
'href' => sprintf('.%s', $potentialName),
|
||||
];
|
||||
|
||||
$alternateLinks[] = [
|
||||
'type' => 'application/xml',
|
||||
'href' => sprintf('./%s', $potentialName),
|
||||
];
|
||||
|
||||
$alternateLinks[] = [
|
||||
'type' => 'application/xml',
|
||||
'href' => sprintf('./.%s', $potentialName),
|
||||
];
|
||||
}
|
||||
|
||||
foreach ($alternateLinks as $alternateLink) {
|
||||
if (empty($alternateLink['type']) || !in_array($alternateLink['type'], config('cyca.feedTypes'))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
try {
|
||||
$url = \App\Helpers\Url::makeUrlAbsolute($this->response->effectiveUri(), $alternateLink['href']);
|
||||
} catch (\Exception $ex) {
|
||||
// Malformed URL
|
||||
continue;
|
||||
}
|
||||
|
||||
$client = new SimplePie();
|
||||
|
||||
$client->force_feed(true);
|
||||
$client->set_feed_url($url);
|
||||
$client->set_stupidly_fast(true);
|
||||
$client->enable_cache(false);
|
||||
|
||||
if (!$client->init()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$feed = Feed::firstOrCreate(['url' => $url]);
|
||||
|
||||
if (!in_array($feed->id, $toSync)) {
|
||||
$toSync[] = $feed->id;
|
||||
}
|
||||
}
|
||||
|
||||
$this->document->feeds()->sync($toSync);
|
||||
}
|
||||
|
||||
/**
|
||||
* Place in an array all attributes of a specific DOMElement.
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
private function domElementToArray(DOMElement $node)
|
||||
{
|
||||
$data = [];
|
||||
|
||||
foreach ($node->attributes as $attribute) {
|
||||
$key = Str::slug($attribute->localName);
|
||||
$value = \App\Helpers\Cleaner::cleanupString($attribute->nodeValue);
|
||||
|
||||
$data[$key] = $value;
|
||||
}
|
||||
|
||||
return $data;
|
||||
}
|
||||
|
||||
/**
|
||||
* Find document's title.
|
||||
*/
|
||||
private function findTitle()
|
||||
{
|
||||
$node = $this->findFirstNode('//head/title');
|
||||
|
||||
if (empty($node)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
$this->document->title = \App\Helpers\Cleaner::cleanupString($node->nodeValue, true, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Find and parse meta tags.
|
||||
*/
|
||||
private function findMetaTags()
|
||||
{
|
||||
$nodes = $this->findNodes('//head/meta');
|
||||
|
||||
$this->metaTags = [];
|
||||
|
||||
foreach ($nodes as $node) {
|
||||
$this->parseMetaTag($node);
|
||||
}
|
||||
|
||||
$this->metaTags = collect($this->metaTags)->sortKeys()->all();
|
||||
|
||||
//TODO: Format description
|
||||
$this->document->description = data_get($this->metaTags, 'meta.Description.content');
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse a meta tag and return a formated array.
|
||||
*/
|
||||
private function parseMetaTag(DOMElement $node)
|
||||
{
|
||||
$data = $this->domElementToArray($node);
|
||||
|
||||
if (empty($data)) {
|
||||
return;
|
||||
}
|
||||
|
||||
$group = 'nonStandard';
|
||||
$name = null;
|
||||
|
||||
if (!empty($data['charset'])) {
|
||||
$group = 'charset';
|
||||
$name = 'charset';
|
||||
} elseif (!empty($data['name'])) {
|
||||
$group = 'meta';
|
||||
$name = $data['name'];
|
||||
|
||||
$data['originalName'] = $data['name'];
|
||||
|
||||
unset($data['name']);
|
||||
} elseif (!empty($data['property'])) {
|
||||
$group = 'properties';
|
||||
$name = $data['property'];
|
||||
|
||||
$data['originalName'] = $data['property'];
|
||||
|
||||
unset($data['property']);
|
||||
} elseif (!empty($data['http-equiv'])) {
|
||||
$group = 'pragma';
|
||||
$name = $data['http-equiv'];
|
||||
|
||||
$data['originalName'] = $data['http-equiv'];
|
||||
|
||||
unset($data['http-equiv']);
|
||||
}
|
||||
|
||||
$name = Str::studly(str_replace(':', '_', $name));
|
||||
|
||||
if (!empty($name)) {
|
||||
switch ($name) {
|
||||
// Handle specific meta tag formatting here
|
||||
default:
|
||||
$this->metaTags[$group][$name] = $data;
|
||||
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
$this->metaTags[$group][] = $data;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Find and parse link tags.
|
||||
*/
|
||||
private function findLinkTags()
|
||||
{
|
||||
$nodes = $this->findNodes('//head/link');
|
||||
|
||||
$this->linkTags = [];
|
||||
|
||||
foreach ($nodes as $node) {
|
||||
$this->parseLinkTag($node);
|
||||
}
|
||||
|
||||
$this->linkTags = collect($this->linkTags)->sortKeys()->all();
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse a link tag and return a formated array.
|
||||
*
|
||||
* @return array
|
||||
*/
|
||||
private function parseLinkTag(DOMElement $node)
|
||||
{
|
||||
$data = $this->domElementToArray($node);
|
||||
|
||||
if (empty($data)) {
|
||||
return;
|
||||
}
|
||||
|
||||
$group = 'Others';
|
||||
|
||||
if (!empty($data['rel'])) {
|
||||
$group = Str::studly($data['rel']);
|
||||
}
|
||||
|
||||
$this->linkTags[$group][] = $data;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch all link tags marked as being a favicon, then determine which one
|
||||
* is best suited to be the one.
|
||||
*/
|
||||
private function findBestFavicon()
|
||||
{
|
||||
$defaultFaviconUrl = \App\Helpers\Url::makeUrlAbsolute($this->response->effectiveUri(), '/favicon.ico');
|
||||
$potentialIcons = [];
|
||||
|
||||
$links = $this->linkTags;
|
||||
|
||||
foreach ($links as $group => $tags) {
|
||||
foreach ($tags as $tag) {
|
||||
if (!empty($tag['rel']) && in_array($tag['rel'], config('cyca.faviconRels'))) {
|
||||
$potentialIcons[] = $tag['href'];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
$potentialIcons[] = $defaultFaviconUrl;
|
||||
|
||||
$topWidth = 0;
|
||||
$selectedIcon = null;
|
||||
|
||||
foreach ($potentialIcons as $potentialIcon) {
|
||||
$url = \App\Helpers\Url::makeUrlAbsolute($this->response->effectiveUri(), $potentialIcon);
|
||||
|
||||
try {
|
||||
$response = Http::timeout(10)->get($url);
|
||||
} catch (\Exception $ex) {
|
||||
report($ex);
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!$response->ok()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
$body = $response->body();
|
||||
$filePath = sprintf('%s/favicon_%s', $this->document->getStoragePath(), md5($body));
|
||||
|
||||
Storage::put($filePath, $body);
|
||||
|
||||
$mimetype = Storage::mimetype($filePath);
|
||||
|
||||
if (!$this->isValidFavicon($body, $mimetype)) {
|
||||
Storage::delete($filePath);
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
$width = $this->getImageWidth($body, $mimetype);
|
||||
|
||||
if ($width >= $topWidth) {
|
||||
$topWidth = $width;
|
||||
$selectedIcon = $filePath;
|
||||
} else {
|
||||
Storage::delete($filePath);
|
||||
}
|
||||
}
|
||||
|
||||
if (!empty($selectedIcon)) {
|
||||
$this->document->favicon_path = $selectedIcon;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine if favicon has a valid mime type.
|
||||
*
|
||||
* @param string $mimetype
|
||||
* @param mixed $body
|
||||
*
|
||||
* @return bool
|
||||
*/
|
||||
private function isValidFavicon($body, $mimetype)
|
||||
{
|
||||
if (!in_array($mimetype, config('cyca.faviconTypes'))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return $this->isValidImage($body, $mimetype);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine if favicon is a valid image. Mime type is used to adjust tests.
|
||||
*
|
||||
* @param string $mimeType
|
||||
* @param mixed $body
|
||||
*
|
||||
* @return bool
|
||||
*/
|
||||
private function isValidImage($body, $mimeType)
|
||||
{
|
||||
switch ($mimeType) {
|
||||
case 'image/x-icon':
|
||||
case 'image/vnd.microsoft.icon':
|
||||
$loader = new IcoFileService();
|
||||
|
||||
try {
|
||||
$loader->extractIcon($body, 16, 16);
|
||||
} catch (\Exception $ex) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
case 'image/svg':
|
||||
case 'image/svg+xml':
|
||||
$im = new Imagick();
|
||||
|
||||
try {
|
||||
$im->readImageBlob($body);
|
||||
} catch (\Exception $ex) {
|
||||
$im->destroy();
|
||||
|
||||
report($ex);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
default:
|
||||
$res = @imagecreatefromstring($body);
|
||||
|
||||
if (!$res) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Obtain width of image.
|
||||
*
|
||||
* @param string $mimeType
|
||||
* @param mixed $body
|
||||
*
|
||||
* @return int
|
||||
*/
|
||||
private function getImageWidth($body, $mimeType)
|
||||
{
|
||||
switch ($mimeType) {
|
||||
case 'image/x-icon':
|
||||
case 'image/vnd.microsoft.icon':
|
||||
return 16;
|
||||
case 'image/svg':
|
||||
case 'image/svg+xml':
|
||||
return 1024;
|
||||
default:
|
||||
$infos = @getimagesizefromstring($body);
|
||||
|
||||
if (!$infos) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return $infos[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
89
app/Analyzers/PdfAnalyzer.php
Executable file
89
app/Analyzers/PdfAnalyzer.php
Executable file
@@ -0,0 +1,89 @@
|
||||
<?php
|
||||
|
||||
namespace App\Analyzers;
|
||||
|
||||
use Smalot\PdfParser\Parser;
|
||||
|
||||
/**
|
||||
* Extract information from a PDF file.
|
||||
*/
|
||||
class PdfAnalyzer extends Analyzer
|
||||
{
|
||||
/**
|
||||
* PDF parser.
|
||||
*
|
||||
* @var \Smalot\PdfParser\Parser
|
||||
*/
|
||||
protected $parser;
|
||||
|
||||
/**
|
||||
* Analyzes document.
|
||||
*/
|
||||
public function analyze()
|
||||
{
|
||||
if (empty($this->body)) {
|
||||
return;
|
||||
}
|
||||
|
||||
$this->extractDetails();
|
||||
$this->storeDetailsOnDisk();
|
||||
$this->applyDetailsToDocument();
|
||||
|
||||
$this->document->description = (string) view('partials.details.pdf')->with([
|
||||
'details' => $this->details,
|
||||
'url' => asset(str_replace('public', 'storage', $this->document->getStoragePath()).'/body'),
|
||||
]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Store some details in database. This method uses an array to map document
|
||||
* properties to metadata properties.
|
||||
*
|
||||
* @param mixed $mappings
|
||||
*/
|
||||
protected function applyDetailsToDocument($mappings = [])
|
||||
{
|
||||
$mappings = [
|
||||
'title' => 'Title',
|
||||
];
|
||||
|
||||
parent::applyDetailsToDocument($mappings);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return an instance of PDF parser.
|
||||
*
|
||||
* @return \Smalot\PdfParser\Parser
|
||||
*/
|
||||
private function getParser()
|
||||
{
|
||||
if (!$this->parser) {
|
||||
$this->parser = new Parser();
|
||||
}
|
||||
|
||||
return $this->parser;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse PDF content.
|
||||
*
|
||||
* @param mixed $content
|
||||
*/
|
||||
private function parseContent($content)
|
||||
{
|
||||
$parser = $this->getParser();
|
||||
|
||||
return $parser->parseContent($content);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return an array of meta data included in PDF.
|
||||
*/
|
||||
private function extractDetails()
|
||||
{
|
||||
$data = $this->parseContent($this->body);
|
||||
$this->details = $data->getDetails();
|
||||
|
||||
return $this->details;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user