Source code for sarracenia.flowcb.accept.tolocal

"""
Plugin tolocal.py:
    This is a helper script to work with filters.
    A data pump usually advertises files using web URLs, but on a server where
    the files are directly available, it is more efficient to access them as
    local files, so this plugin rewrites the URL and filters operate on file URLs.
    
Example:
    baseDir /var/www/html
    url is http://localhost/<date>/<src>/input/file.txt

    flowcb sarracenia.flowcb.accept.tolocal.ToLocal   # converts web URL to file URL

            http://localhost/ --> file://var/www/html/
            url is now file://var/www/html/<date>/<src>/input/file.txt
            m.savedurl = http://localhost/

    flowcb sarracenia.flowcb.accept.<some converter that works on local files.>

            A new file is created in another directory.
            url is now file://var/www/html/<date>/<src>/output/file.txt

    flowcb sarracenia.flowcb.accept.tohttp.ToHttp     # turns the file URL back into a web one.

            file://var/www/html/ --> http://localhost/
            url is now:   http://localhost/<date>/<src>/output/file.txt


    The regular expression used to find the web url matches either http or https
    and captures everything up to and including the first '/' after the host name.

    if you need to capture a different kind of url, such as ...

    https://hostname/~user/ ....

    The easiest way is to set toLocalUrl as follows:

    baseDir /home/user/www
    toLocalUrl (https://hostname/~user/)

    The parentheses around the URL select the value to be put in m.savedurl, which
    will be restored when the companion plugin msg_2http is called.

Usage: 
    flowcb sarracenia.flowcb.accept.tolocal.ToLocal
"""
import logging
import re
from sarracenia.flowcb import FlowCB
logger = logging.getLogger(__name__)


class ToLocal(FlowCB):
    """Rewrite the web URL of each incoming message into a local ``file:`` URL.

    The leading part of ``message['urlstr']`` matched by the URL pattern is
    saved in ``message['savedurl']`` and replaced by the local root directory.

    Options read from ``self.o``:
        baseDir:      default local root directory.
        toLocalRoot:  overrides ``baseDir`` as the local root when set.
        toLocalUrl:   regular expression replacing the default
                      ``(http[s]{0,1}://[^/]+/)`` pattern; its first group
                      (or the whole match if it has no group) is what gets
                      stored in ``message['savedurl']``.
    """

    def __init__(self, options):
        """Declare the plugin options and resolve the local root and URL pattern.

        Args:
            options: configuration object handed on to ``FlowCB.__init__``.

        Raises:
            re.error: if ``toLocalUrl`` is not a valid regular expression.
        """
        super().__init__(options, logger)

        # Local root: baseDir by default, toLocalRoot takes precedence.
        self._ldocroot = None
        if self.o.baseDir:
            self._ldocroot = self.o.baseDir

        self.o.add_option('toLocalRoot', 'str')
        if self.o.toLocalRoot:
            self._ldocroot = self.o.toLocalRoot

        # Default pattern: scheme + host, up to and including the first '/'.
        self._lurlre = re.compile(r"(http[s]{0,1}://[^/]+/)")

        self.o.add_option('toLocalUrl', 'str')
        if self.o.toLocalUrl:
            self._lurlre = re.compile(self.o.toLocalUrl)

    def after_accept(self, worklist):
        """Convert ``urlstr`` of every message in ``worklist.incoming``.

        For each message whose ``urlstr`` starts with the URL pattern, the
        matched prefix is stored in ``message['savedurl']`` and ``urlstr``
        becomes ``'file:/' + <local root> + '/' + <rest of the url>``.

        Messages whose ``urlstr`` does not match are logged and left
        untouched instead of aborting the whole batch.  If no local root is
        configured, nothing is converted.

        Args:
            worklist: object whose ``incoming`` attribute is iterated; each
                item is used as a mapping with a ``'urlstr'`` key.
        """
        if self._ldocroot is None:
            # Without a root there is nothing to substitute; the original
            # code failed with TypeError on the first message here.
            logger.error(
                "neither baseDir nor toLocalRoot is set, urls left unchanged")
            return

        # Fall back to the whole match when the pattern has no capture group.
        saved_group = 1 if self._lurlre.groups else 0

        for message in worklist.incoming:
            urlstr = message['urlstr']
            logger.debug("input: urlstr: %s", urlstr)

            matched = self._lurlre.match(urlstr)
            if matched is None:
                # TODO: should this append to worklist.rejected or
                # worklist.failed instead of leaving the message as is?
                logger.warning(
                    "urlstr %s does not match %s, left unchanged",
                    urlstr, self._lurlre.pattern)
                continue

            message['savedurl'] = matched.group(saved_group)

            # Replace only the matched leading prefix.  Slicing (rather than
            # Pattern.sub) keeps later occurrences of the pattern intact and
            # avoids backslash-escape processing of the root directory.
            local_path = self._ldocroot + '/' + urlstr[matched.end():]
            message['urlstr'] = 'file:/%s' % local_path

            # Report the root actually used (toLocalRoot may override baseDir).
            logger.debug("doc_root=%s ", self._ldocroot)
            logger.debug("output: savedurl: %s", message['savedurl'])