Source code for sarracenia.transfer.https

# This file is part of sarracenia.
# The sarracenia suite is Free and is proudly provided by the Government of Canada
# Copyright (C) Her Majesty The Queen in Right of Canada, 2008-2021
#
# Sarracenia repository: https://github.com/MetPX/sarracenia
# Documentation: https://github.com/MetPX/sarracenia
#
########################################################################
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; version 2 of the License.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
#
#

import logging
import os
import sarracenia
import ssl
import subprocess
import sys

from sarracenia.transfer import Transfer
from sarracenia.transfer import alarm_cancel, alarm_set, alarm_raise

import urllib.error, urllib.parse, urllib.request
from urllib.parse import unquote

logger = logging.getLogger(__name__)


class Https(Transfer):
    """
    HyperText Transfer Protocol (HTTP) ( https://en.wikipedia.org/wiki/Hypertext_Transfer_Protocol )
    sarracenia transfer protocol subclass supports/uses additional custom options:

    * accelWgetCommand (default: '/usr/bin/wget %s -o - -O %d' )

    built with:
         urllib.request ( https://docs.python.org/3/library/urllib.request.html )
    """
    def __init__(self, proto, options):
        """
        Register the accelWgetCommand option, build the TLS context selected
        by the optional ``tlsRigour`` setting (lax / normal / strict) and
        reset the connection state.

        proto and options are handed unchanged to the Transfer constructor.
        """
        super().__init__(proto, options)

        self.o.add_option('accelWgetCommand', 'str',
                          '/usr/bin/wget %s -o - -O %d')

        logger.debug("sr_http __init__")

        # 'normal' rigour (and the absence of the option) use the stock
        # default context.
        self.tlsctx = ssl.create_default_context()
        if hasattr(self.o, 'tlsRigour'):
            self.o.tlsRigour = self.o.tlsRigour.lower()
            if self.o.tlsRigour == 'lax':
                # no hostname check, no certificate verification.
                self.tlsctx = ssl.create_default_context()
                self.tlsctx.check_hostname = False
                self.tlsctx.verify_mode = ssl.CERT_NONE

            elif self.o.tlsRigour == 'strict':
                # refuse TLS 1.0 / 1.1 and require a verified certificate.
                self.tlsctx = ssl.SSLContext(ssl.PROTOCOL_TLS)
                self.tlsctx.options |= ssl.OP_NO_TLSv1
                self.tlsctx.options |= ssl.OP_NO_TLSv1_1
                self.tlsctx.check_hostname = True
                self.tlsctx.verify_mode = ssl.CERT_REQUIRED
                self.tlsctx.load_default_certs()
                # TODO Find a way to reintroduce certificate revocation (CRL) in the future
                #  self.tlsctx.verify_flags = ssl.VERIFY_CRL_CHECK_CHAIN
                #  https://github.com/MetPX/sarracenia/issues/330
            elif self.o.tlsRigour == 'normal':
                pass
            else:
                logger.warning(
                    "option tlsRigour must be one of: lax, normal, strict")

        self.init()

    @staticmethod
    def registered_as():
        """Return the list of URL schemes served by this class."""
        return ['http', 'https']

    # cd
    def cd(self, path):
        """Remember *path* as the current remote path (no network traffic)."""
        logger.debug("sr_http cd %s" % path)
        self.cwd = os.path.dirname(path)
        self.path = path

    # for compatibility... always new connection with http
    def check_is_connected(self):
        """
        Return True when a connection is marked open and still targets the
        configured sendTo; otherwise return False (closing first when the
        destination changed).
        """
        logger.debug("sr_http check_is_connected")

        if not self.connected:
            return False

        if self.sendTo != self.o.sendTo:
            self.close()
            return False

        return True

    # close
    def close(self):
        """Forget the current connection by resetting all per-connection state."""
        logger.debug("sr_http close")
        self.init()

    # connect...
    def connect(self):
        """
        Record the destination and timeout from the options and look up the
        credentials.  Return True when credentials were found, else False.
        The URL itself is only opened later, by __open__.
        """
        logger.debug("sr_http connect %s" % self.o.sendTo)

        if self.connected:
            self.close()

        self.connected = False
        self.sendTo = self.o.sendTo
        self.timeout = self.o.timeout

        return bool(self.credentials())

    # credentials...
    def credentials(self):
        """
        Fetch user, password and bearer token for self.sendTo from the
        credential store in the options.

        Return True when an entry was found, False (after logging an error)
        when it was not or when the lookup failed.
        """
        logger.debug("sr_http credentials %s" % self.sendTo)

        try:
            _, details = self.o.credentials.get(self.sendTo)
            if details:
                url = details.url
                # empty strings mean "not supplied".
                self.user = url.username if url.username != '' else None
                self.password = url.password if url.password != '' else None
                self.bearer_token = getattr(details, 'bearer_token', None)
                return True

            logger.error(
                "sr_http/credentials: unable to get credentials for %s" %
                self.sendTo)

        except Exception:
            logger.error(
                "sr_http/credentials: unable to get credentials for %s" %
                self.sendTo)
            logger.debug('Exception details: ', exc_info=True)

        return False

    # get
    def get(self,
            msg,
            remote_file,
            local_file,
            remote_offset=0,
            local_offset=0,
            length=0,
            exactLength=False):
        """
        Download remote_file into local_file.

        The URL is sendTo + msg['retrievePath'] when the message carries a
        retrievePath, otherwise scheme://netloc/ + the quoted current path
        and file name.  Returns whatever read_writelocal returns, or False
        when the URL could not be opened.  Errors raised by __open__
        propagate to the caller.
        """
        logger.debug("get %s %s %d" % (remote_file, local_file, local_offset))
        logger.debug("sr_http self.path %s" % self.path)

        # open self.http
        if 'retrievePath' in msg:
            url = self.sendTo + '/' + msg['retrievePath']
        else:
            u = urllib.parse.urlparse(self.sendTo)
            url = u.scheme + '://' + u.netloc + '/' + urllib.parse.quote(
                self.path + '/' + remote_file, safe='/+')

        if not self.__open__(url, remote_offset, length):
            return False

        # read from self.http write to local_file
        # NOTE(review): read_writelocal is not defined in this class;
        # presumably inherited from Transfer.
        return self.read_writelocal(remote_file, self.http, local_file,
                                    local_offset, length, exactLength)

    def getAccelerated(self,
                       msg,
                       remote_file,
                       local_file,
                       length,
                       remote_offset=0,
                       exactLength=False):
        """
        Download msg['baseUrl']/msg['relPath'] into local_file by running the
        external command configured in accelWgetCommand ('%s' is replaced by
        the URL, '%d' by the local file).

        When exactLength is set, a Range header covering length bytes
        starting at remote_offset is passed to the command.

        Returns length on success, -1 when the command exits non-zero.
        """
        import re  # local import: only needed for placeholder substitution

        url = msg['baseUrl'] + '/' + msg['relPath']
        substitutions = {'%s': url, '%d': local_file}

        # Split the template *before* substituting so that whitespace inside
        # the URL or file name cannot break an argument apart (no shell is
        # involved, so backslash-escaping would not help).  A single-pass
        # substitution also keeps '%d' sequences inside the URL intact.
        cmd = [
            re.sub(r'%[sd]', lambda m: substitutions[m.group(0)], token)
            for token in self.o.accelWgetCommand.split()
        ]

        if exactLength:
            # HTTP byte ranges are inclusive: first-last.
            last_byte = remote_offset + length - 1
            cmd = [
                cmd[0], f"--header=Range: bytes={remote_offset}-{last_byte}"
            ] + cmd[1:]

        logger.info("accel_wget: %s" % ' '.join(cmd))
        returncode = subprocess.Popen(cmd).wait()
        if returncode != 0:
            logger.warning("binary accelerator %s returned: %d" %
                           (cmd, returncode))
            return -1
        # FIXME: length is not validated.
        return length

    # init
    def init(self):
        """Reset every per-connection attribute to its disconnected default."""
        Transfer.init(self)

        logger.debug("sr_http init")
        self.connected = False
        self.http = None
        self.details = None
        self.seek = True

        self.urlstr = ''
        self.path = ''
        self.cwd = ''

        self.data = ''
        self.entries = {}

    # ls
    def ls(self):
        """
        Fetch the page at sendTo/<current path> and return its raw content.

        Returns the bytes read (None when nothing was read), or the empty
        entries dict when the URL could not be opened.  A read failure is
        logged and whatever was read so far is returned.
        """
        logger.debug("sr_http ls")

        # open self.http
        self.entries = {}

        url = self.sendTo + '/' + urllib.parse.quote(self.path, safe='/+')

        if not self.__open__(url):
            return self.entries

        # get html page for directory
        chunks = []
        try:
            while True:
                # each read gets its own timeout window.
                alarm_set(self.o.timeout)
                chunk = self.http.read(self.o.bufsize)
                alarm_cancel()
                if not chunk:
                    break
                chunks.append(chunk)

        except Exception:
            logger.warning("sr_http/ls: unable to open %s" % self.urlstr)
            logger.debug('Exception details: ', exc_info=True)
        finally:
            # make sure no alarm stays armed when read() raised.
            alarm_cancel()

        if not chunks:
            return None
        # join with an empty separator of the same type as the chunks.
        return chunks[0][:0].join(chunks)

    # open
    def __open__(self, path, remote_offset=0, length=0):
        """
        Open the URL *path* and store the response object in self.http.

        remote_offset / length select a byte range: nothing is requested
        when remote_offset is 0; an open-ended range is requested when
        length is not positive.

        Returns True on success.  Any failure is logged and re-raised
        (urllib.error.HTTPError, urllib.error.URLError, or whatever else
        was raised), with self.connected left False.
        """
        logger.debug(f"{path}")

        self.http = None
        self.connected = False
        self.req = None
        self.urlstr = path

        # have noticed that some site does not allow // in path
        if path.startswith('http://') and '//' in path[7:]:
            self.urlstr = 'http://' + path[7:].replace('//', '/')

        if path.startswith('https://') and '//' in path[8:]:
            self.urlstr = 'https://' + path[8:].replace('//', '/')

        alarm_set(self.o.timeout)

        try:
            # when credentials are needed.
            headers = {'user-agent': 'Sarracenia ' + sarracenia.__version__}
            if self.bearer_token:
                # NOTE(review): this writes the token to the debug log.
                logger.debug('bearer_token: %s' % self.bearer_token)
                headers['Authorization'] = 'Bearer ' + self.bearer_token

            if self.user is not None:
                password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
                # takeaway credentials info from urlstr
                cred = self.user + '@'
                self.urlstr = self.urlstr.replace(cred, '')
                if self.password is not None:
                    cred = self.user + ':' + self.password + '@'
                    self.urlstr = self.urlstr.replace(cred, '')

                # continue with authentication
                # unquote() cannot take None, so only decode a real password.
                password = None
                if self.password is not None:
                    password = unquote(self.password)
                password_mgr.add_password(None, self.urlstr, self.user,
                                          password)
                auth_handler = urllib.request.HTTPBasicAuthHandler(
                    password_mgr)

                ssl_handler = urllib.request.HTTPSHandler(0, self.tlsctx)

                # create "opener" (OpenerDirector instance)
                opener = urllib.request.build_opener(auth_handler,
                                                     ssl_handler)

                # use the opener to fetch a URL; the response itself is not
                # needed, so release it right away.
                opener.open(self.urlstr).close()

                # Install the opener.
                urllib.request.install_opener(opener)

            # Now all calls to get the request use our opener.
            self.req = urllib.request.Request(self.urlstr, headers=headers)

            # set range in byte if needed
            if remote_offset != 0:
                if length > 0:
                    str_range = 'bytes=%d-%d' % (remote_offset,
                                                 remote_offset + length - 1)
                else:
                    # no usable length: ask for everything from the offset.
                    str_range = 'bytes=%d-' % remote_offset
                self.req.headers['Range'] = str_range

            # https without user : create/use an ssl context
            ctx = None
            if self.user is None and self.urlstr.startswith('https'):
                ctx = self.tlsctx

            # open... we are connected
            if self.timeout is None:
                self.http = urllib.request.urlopen(self.req, context=ctx)
            else:
                self.http = urllib.request.urlopen(self.req,
                                                   timeout=self.timeout,
                                                   context=ctx)

            self.connected = True
            return True

        except urllib.error.HTTPError as e:
            logger.error('Download failed 4 %s ' % self.urlstr)
            logger.error(
                'Server couldn\'t fulfill the request. Error code: %s, %s' %
                (e.code, e.reason))
            self.connected = False
            raise
        except urllib.error.URLError as e:
            logger.error('Download failed 5 %s ' % self.urlstr)
            logger.error('Failed to reach server. Reason: %s' % e.reason)
            self.connected = False
            raise
        except Exception:
            logger.warning("unable to open %s" % self.urlstr)
            logger.debug('Exception details: ', exc_info=True)
            self.connected = False
            raise
        finally:
            # the alarm is disarmed on every exit path.
            alarm_cancel()