Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding pdftotext for PDF support in Hypercube #77

Merged
merged 3 commits into from
Sep 18, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions Hypercube/cfg/config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

hypercube:
# paths to the tesseract and pdftotext executables
executable: tesseract
tesseract_executable: tesseract
pdftotext_executable: pdftotext

fedora_resource:
base_url: http://localhost:8080/fcrepo/rest
Expand All @@ -20,4 +21,4 @@ syn:
# Path to the syn config file for authentication.
# example can be found here:
# https://github.com/Islandora-CLAW/Syn/blob/master/conf/syn-settings.example.xml
config: ../syn-settings.xml
config: ../syn-settings.xml
44 changes: 37 additions & 7 deletions Hypercube/src/Controller/HypercubeController.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

use GuzzleHttp\Psr7\StreamWrapper;
use Islandora\Crayfish\Commons\CmdExecuteService;
use Monolog\Logger;
use Psr\Http\Message\ResponseInterface;
use Symfony\Component\HttpFoundation\BinaryFileResponse;
use Symfony\Component\HttpFoundation\Request;
Expand All @@ -25,17 +26,35 @@ class HypercubeController
/**
* @var string
*/
protected $executable;
protected $tesseract_executable;

/**
* @var string
*/
protected $pdftotext_executable;

/**
* @var \Monolog\Logger
*/
protected $log;

/**
 * HypercubeController constructor.
 *
 * Stores the command execution service, the two command line tool
 * names/paths, and the logger on the controller for later use.
 *
 * @param \Islandora\Crayfish\Commons\CmdExecuteService $cmd
 *   Service kept on the controller for running command lines.
 * @param string $tesseract_executable
 *   Tesseract executable; used for any content type other than PDF.
 * @param string $pdftotext_executable
 *   pdftotext executable; used when the content type is application/pdf.
 * @param \Monolog\Logger $log
 *   Logger used for debug output.
 */
public function __construct(
    CmdExecuteService $cmd,
    $tesseract_executable,
    $pdftotext_executable,
    Logger $log
) {
    $this->cmd = $cmd;
    $this->tesseract_executable = $tesseract_executable;
    $this->pdftotext_executable = $pdftotext_executable;
    $this->log = $log;
}

/**
Expand All @@ -50,10 +69,21 @@ public function get(Request $request)
// Get tiff as a resource.
$body = StreamWrapper::getResource($fedora_resource->getBody());

// Arguments to OCR command are sent as a custom header
// Arguments to command line are sent as a custom header
$args = $request->headers->get('X-Islandora-Args');

$cmd_string = $this->executable . ' stdin stdout ' . $args;
// Check content type and use the appropriate command line tool.
$content_type = $fedora_resource->getHeader('Content-Type')[0];

$this->log->debug("Got Content-Type:", ['type' => $content_type]);

if ($content_type == 'application/pdf') {
$cmd_string = $this->pdftotext_executable . " $args - -";
} else {
$cmd_string = $this->tesseract_executable . " stdin stdout $args";
}

$this->log->debug("Executing command:", ['cmd' => $cmd_string]);

// Return response.
try {
Expand Down
4 changes: 3 additions & 1 deletion Hypercube/src/app.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
// Factory for the Hypercube controller: wires in the command execution
// service, both configured executables (tesseract and pdftotext), and the
// application's monolog logger, in the order the constructor expects.
$app['hypercube.controller'] = function ($app) {
    return new HypercubeController(
        $app['crayfish.cmd_execute_service'],
        $app['crayfish.hypercube.tesseract_executable'],
        $app['crayfish.hypercube.pdftotext_executable'],
        $app['monolog']
    );
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

use Islandora\Crayfish\Commons\CmdExecuteService;
use Islandora\Hypercube\Controller\HypercubeController;
use Monolog\Logger;
use Prophecy\Argument;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\StreamInterface;
Expand All @@ -21,9 +22,12 @@ class HypercubeControllerTest extends \PHPUnit_Framework_TestCase
public function testOptions()
{
$mock_service = $this->prophesize(CmdExecuteService::class)->reveal();
$mock_logger = $this->prophesize(Logger::class)->reveal();
$controller = new HypercubeController(
$mock_service,
''
'tesseract',
'pdftotext',
$mock_logger
);

$response = $controller->options();
Expand All @@ -39,12 +43,27 @@ public function testOptions()
* @covers ::get
*/
public function testTesseractErrorReturns500()
{
    // A non-PDF content type makes the controller build a tesseract command.
    $mimetype = 'image/tiff';

    $this->errorReturns500($mimetype);
}

/**
 * Runs the shared error scenario with a PDF content type.
 *
 * @covers ::__construct
 * @covers ::get
 */
public function testPdfToTextErrorReturns500()
{
    // NOTE(review): the visible part of errorReturns500() mocks the
    // Content-Type header as 'image/tiff' regardless of its argument;
    // confirm the pdftotext branch is actually exercised here.
    $mimetype = 'application/pdf';

    $this->errorReturns500($mimetype);
}

protected function errorReturns500($mimetype)
{
// Mock a TesseractService to create a controller.
$prophecy = $this->prophesize(CmdExecuteService::class);
$prophecy->execute(Argument::any(), Argument::any())->willThrow(new \RuntimeException("ERROR", 500));
$mock_service = $prophecy->reveal();
$controller = new HypercubeController($mock_service, '');
$mock_logger = $this->prophesize(Logger::class)->reveal();
$controller = new HypercubeController($mock_service, 'tesseract', 'pdftotext', $mock_logger);

// Mock a stream body for a Fedora response.
$prophecy = $this->prophesize(StreamInterface::class);
Expand All @@ -54,6 +73,7 @@ public function testTesseractErrorReturns500()

// Mock a Fedora response.
$prophecy = $this->prophesize(ResponseInterface::class);
$prophecy->getHeader('Content-Type')->willReturn(['image/tiff']);
$prophecy->getStatusCode()->willReturn(200);
$prophecy->getBody()->willReturn($mock_stream);
$mock_fedora_response = $prophecy->reveal();
Expand All @@ -78,10 +98,25 @@ public function testTesseractErrorReturns500()
*/
public function testTesseractSuccessReturns200()
{
    // A non-PDF content type makes the controller build a tesseract command.
    $mimetype = 'image/tiff';

    $this->successReturns200($mimetype);
}

/**
 * Runs the shared success scenario with a PDF content type, which the
 * controller handles with the pdftotext command.
 *
 * @covers ::__construct
 * @covers ::get
 */
public function testPdfToTextSuccessReturns200()
{
    $mimetype = 'application/pdf';

    $this->successReturns200($mimetype);
}

protected function successReturns200($mimetype)
{
// Mock a controller.
$prophecy = $this->prophesize(CmdExecuteService::class);
$mock_service = $prophecy->reveal();
$controller = new HypercubeController($mock_service, '');
$mock_logger = $this->prophesize(Logger::class)->reveal();
$controller = new HypercubeController($mock_service, 'tesseract', 'pdftotext', $mock_logger);

// Mock a stream body for a Fedora response.
$prophecy = $this->prophesize(StreamInterface::class);
Expand All @@ -91,6 +126,7 @@ public function testTesseractSuccessReturns200()

// Mock a Fedora response.
$prophecy = $this->prophesize(ResponseInterface::class);
$prophecy->getHeader('Content-Type')->willReturn([$mimetype]);
$prophecy->getStatusCode()->willReturn(200);
$prophecy->getBody()->willReturn($mock_stream);
$mock_fedora_response = $prophecy->reveal();
Expand All @@ -104,6 +140,7 @@ public function testTesseractSuccessReturns200()
$request->headers->set('ApixLdpResource', 'http://localhost:8080/fcrepo/rest/foo');
$request->attributes->set('fedora_resource', $mock_fedora_response);

// Check success.
$response = $controller->get($request);
$this->assertTrue($response->getStatusCode() == 200, "Response must return 200");
}
Expand Down