As you may be aware from previous threads on this group from my co-worker Vicky Philips, we at the National Library of Wales have been having some issues with the generation of EAD and DC from the web interface bogging our web server down. We looked to cache the entire AtoM system's EAD and DC offline instead; however, when trying this with the arCacheDescriptionXmlTask, we found that the entire process would take around 6 months to complete!
Thanks to comments on here regarding chunking the export with the addition of a --skip flag to the above script, we were able to improve this, but due to the size of our archive (~18000 top level archival collections, ~800000 slugs) we still ran into serious memory issues. Most of this we put down to the ORM being inefficient. At one point we logged around 30,000 SQL select queries on a single EAD generation request.
The chunking got me thinking, and we decided to modify arCacheDescriptionXmlTask to take a --slug param. We then generated a list of all of the slugs in AtoM with a simple one-off command:
This could be wrapped up into a simple PHP script, but we didn't need to at the time.
Then, using GNU parallel and a VM with a few (24) CPU cores, we were able to process this input list in parallel, one slug per invocation of the modified task below:
/**
 * CLI task (cache:xml-representations) that renders descriptions as XML and
 * caches the results as files.
 *
 * Two modes are supported:
 *  - no --slug: every description is exported via
 *    QubitInformationObjectXmlCache::exportAll(), optionally passing --skip;
 *  - --slug=<slug>: only the resource with that slug is exported, which lets
 *    an external driver run one process per slug.
 */
class arCacheDescriptionXmlTask extends arBaseTask
{
  /**
   * Declare the task's options, namespace, name and help text.
   */
  protected function configure()
  {
    $this->addOptions(array(
      new sfCommandOption('application', null, sfCommandOption::PARAMETER_OPTIONAL, 'The application name', 'qubit'),
      new sfCommandOption('env', null, sfCommandOption::PARAMETER_REQUIRED, 'The environment', 'cli'),
      new sfCommandOption('connection', null, sfCommandOption::PARAMETER_REQUIRED, 'The connection name', 'propel'),
      new sfCommandOption('skip', null, sfCommandOption::PARAMETER_OPTIONAL, 'Number of information objects to skip', 0),
      // Default is null (not 0) so "no slug given" is distinguishable from a
      // slug whose value happens to be falsy.
      new sfCommandOption('slug', null, sfCommandOption::PARAMETER_OPTIONAL, 'Slug of resource', null)
    ));

    $this->namespace = 'cache';
    $this->name = 'xml-representations';
    $this->briefDescription = 'Render all descriptions as XML and cache the results as files';
    $this->detailedDescription = <<<EOF
Render all descriptions as XML and cache the results as files
EOF;
  }

  /**
   * Run the task: export a single resource when a slug is supplied,
   * otherwise export everything.
   *
   * @param array $arguments Task arguments, forwarded to the parent task
   * @param array $options   Task options ('skip', 'slug', ...)
   */
  public function execute($arguments = array(), $options = array())
  {
    parent::execute($arguments, $options);

    $slug = isset($options['slug']) ? $options['slug'] : null;

    // Only a non-empty string counts as a slug. Command-line values arrive as
    // strings, so "--slug=0" is honoured, while legacy non-string values
    // (null, false, integer 0) still select the full export.
    if (is_string($slug) && '' !== $slug)
    {
      $this->export($slug);
    }
    else
    {
      $this->exportAll($options);
    }
  }

  /**
   * Export every description, passing the 'skip' option through to the cache.
   *
   * @param array $options Task options; 'skip' is read and defaults to 0
   */
  private function exportAll($options)
  {
    $logger = new sfCommandLogger(new sfEventDispatcher);
    $logger->log('Caching XML representations of information objects...');

    $skip = isset($options['skip']) ? $options['skip'] : 0;

    $cache = new QubitInformationObjectXmlCache(array('logger' => $logger));
    $cache->exportAll(array('skip' => $skip));

    $logger->log('Done.');
  }

  /**
   * Export the single resource identified by $slug.
   *
   * @param string $slug Slug of the resource to export
   *
   * @throws sfException When no resource exists for the given slug
   */
  private function export($slug)
  {
    $obj = QubitObject::getBySlug($slug);

    // Fail loudly on an unknown slug instead of handing a missing object to
    // the exporter; a thrown exception also gives batch drivers an error to
    // detect.
    if (null === $obj)
    {
      throw new sfException(sprintf('No resource found with slug "%s".', $slug));
    }

    $logger = new sfCommandLogger(new sfEventDispatcher);
    $logger->log("Caching XML representation of resource {$slug}");

    $cache = new QubitInformationObjectXmlCache(array('logger' => $logger));
    $cache->export($obj);

    $logger->log('Done.');
  }
}