Module pachyderm_sdk.api.cdr.resolver
Handwritten classes/methods that augment the existing CDR API.
Expand source code
"""Handwritten classes/methods that augment the existing CDR API."""
import os
import gzip
from hashlib import blake2b
from hmac import compare_digest
from pathlib import Path
from typing import Optional
from urllib.parse import urlparse
from betterproto import which_one_of
from . import (
Ref,
Cipher,
Compress,
Concat,
ContentHash,
Http,
SizeLimits,
Slice,
CipherAlgo,
CompressAlgo,
HashAlgo,
)
try:
import requests
from Crypto.Cipher import ChaCha20
CDR_ENABLED = True
except ImportError:
requests = None
ChaCha20 = None
CDR_ENABLED = False
class CdrResolver:
"""Class capable of resolving CDRs returned by the PFS API."""
def __init__(
self,
*,
cache_location: Optional[os.PathLike] = None,
fetch_missing_chunks: bool = True,
http_host_replacement: str = "",
):
"""Creates a CdrResolver.
Whether CDR functionality is enabled is checked at time of initialization.
Parameters
----------
http_host_replacement : str
The value of this parameter replaces the host (including port) within
the presigned URLs when resolving CDRs. This configuration is useful
if, for some reason, the URL that pachd uses to interact with object
storage is different from the one that you use. For example, if your
Pachyderm object storage is running within your kubernetes cluster and
pachd is configured to use a URL which is only valid in cluster.
This may fix any issues, depending on the object storage and how they
generate presigned URLs.
Example: localhost:9000
"""
if not CDR_ENABLED:
raise RuntimeError(
f"CDR functionality is not enabled for this installation "
f"of the pachyderm_sdk package. \n"
f"To enable CDR functionality, reinstall package with: \n"
f" - pip install pachyderm_sdk[cdr]"
)
self.cache = cache_location
if self.cache is not None:
self.cache = Path(os.path.expanduser(self.cache)).resolve()
self.cache.mkdir(parents=True, exist_ok=True)
self.fetch_missing_chunks = fetch_missing_chunks
self.http_host_replacement = http_host_replacement
def resolve(self, ref: Ref) -> bytes:
"""Resolve a CDR reference."""
field, body = which_one_of(ref, "body")
if isinstance(body, Http):
return self._dereference_http(body)
elif isinstance(body, Cipher):
return self._dereference_cipher(body)
elif isinstance(body, Compress):
return self._dereference_compress(body)
elif isinstance(body, ContentHash):
return self._dereference_content_hash(body)
elif isinstance(body, SizeLimits):
return self._dereference_size_limits(body)
elif isinstance(body, Concat):
return b"".join(map(self.resolve, body.refs))
elif isinstance(body, Slice):
return self.resolve(body.inner)[body.start : body.end]
else:
raise ValueError(f"unsupported Ref variant: {body}")
def _dereference_http(self, body: Http) -> bytes:
"""Resolves an HTTP CDR. This means retrieving the data from object
storage using a presigned URL.
The HTTP CDR is the "bottom" of the CDR structure and therefore resolution
does not recurse any deeper.
If http_host_replacement was configured on the class, this substitution
is made here. Additionally the Host header will be overwritten with the
original host of the presigned URL.
"""
url, headers = body.url, body.headers
if self.http_host_replacement:
parsed_url = urlparse(body.url)
headers["Host"] = parsed_url.netloc
url = parsed_url._replace(netloc=self.http_host_replacement).geturl()
response = requests.get(url=url, headers=headers)
try:
response.raise_for_status()
except requests.HTTPError as err:
text = err.response.text
if text:
# If there's a response, log it in the error chain.
raise requests.HTTPError(
f"Error {err.response.status_code} - HTTP response: {text}"
) from err
raise err
return response.content
def _dereference_cipher(self, body: Cipher) -> bytes:
"""Resolves a Cipher CDR. This method must resolve its inner CDR."""
if body.algo != CipherAlgo.CHACHA20:
raise ValueError(f"unrecognized cipher algorithm: {body.algo}")
inner = self.resolve(body.inner)
cipher = ChaCha20.new(key=body.key, nonce=body.nonce)
return cipher.decrypt(inner)
def _dereference_compress(self, body: Compress) -> bytes:
"""Resolves a Compress CDR. This method must resolve its inner CDR."""
if body.algo != CompressAlgo.GZIP:
raise ValueError(f"unrecognized compress algorithm: {body.algo}")
inner = self.resolve(body.inner)
return gzip.decompress(inner)
def _dereference_content_hash(self, body: ContentHash) -> bytes:
"""Resolves a ContentHash CDR. This method must resolve its inner CDR.
If the cache_location has been set:
* the content will be read from disk if the content exists within the cache,
rather than resolving the HTTP reference.
* the content will be saved to the cache after resolving the HTTP reference.
* the cache uses the hex string of the content hash as its key (file name).
"""
def _deref_inner() -> bytes:
if body.algo != HashAlgo.BLAKE2b_256:
raise ValueError(f"unrecognized hash algorithm: {body.algo}")
inner = self.resolve(body.inner)
inner_hash = blake2b(inner, digest_size=32).digest()
if not compare_digest(inner_hash, body.hash):
raise ValueError(
f"content failed hash check. HAVE: {inner_hash} WANT: {body.hash}"
)
return inner
if not self.cache:
return _deref_inner()
chunk_file = self.cache.joinpath(self._chunk_name(body))
if chunk_file.exists():
return chunk_file.read_bytes()
if not chunk_file.exists() and self.fetch_missing_chunks:
content = _deref_inner()
chunk_file.write_bytes(content)
return content
raise FileNotFoundError(f"chunk missing from cache: {chunk_file}")
def _dereference_size_limits(self, body: SizeLimits) -> bytes:
"""Resolves a SizeLimits CDR. This method must resolve its inner CDR."""
inner = self.resolve(body.inner)
if body.min and len(inner) < body.min:
raise ValueError(
f"content failed minimum size check. "
f"HAVE: {len(inner)} bytes "
f"WANT: {body.min} bytes "
)
if body.max and len(inner) > body.max:
raise ValueError(
f"content failed minimum size check. "
f"HAVE: {len(inner)} bytes "
f"WANT: {body.max} bytes "
)
return inner
@staticmethod
def _chunk_name(content_hash: ContentHash) -> str:
algorith = HashAlgo(content_hash.algo)
return f"{algorith.name.lower()}_{content_hash.hash.hex()}"
Classes
class CdrResolver (*, cache_location: Optional[os.PathLike] = None, fetch_missing_chunks: bool = True, http_host_replacement: str = '')
-
Class capable of resolving CDRs returned by the PFS API.
Creates a CdrResolver. Whether CDR functionality is enabled is checked at time of initialization.
Parameters
http_host_replacement
:str
- The value of this parameter replaces the host (including port) within the presigned URLs when resolving CDRs. This configuration is useful if, for some reason, the URL that pachd uses to interact with object storage is different from the one that you use. For example, if your Pachyderm object storage is running within your kubernetes cluster and pachd is configured to use a URL which is only valid in cluster. This may fix any issues, depending on the object storage and how they generate presigned URLs. Example: localhost:9000
Expand source code
class CdrResolver: """Class capable of resolving CDRs returned by the PFS API.""" def __init__( self, *, cache_location: Optional[os.PathLike] = None, fetch_missing_chunks: bool = True, http_host_replacement: str = "", ): """Creates a CdrResolver. Whether CDR functionality is enabled is checked at time of initialization. Parameters ---------- http_host_replacement : str The value of this parameter replaces the host (including port) within the presigned URLs when resolving CDRs. This configuration is useful if, for some reason, the URL that pachd uses to interact with object storage is different from the one that you use. For example, if your Pachyderm object storage is running within your kubernetes cluster and pachd is configured to use a URL which is only valid in cluster. This may fix any issues, depending on the object storage and how they generate presigned URLs. Example: localhost:9000 """ if not CDR_ENABLED: raise RuntimeError( f"CDR functionality is not enabled for this installation " f"of the pachyderm_sdk package. \n" f"To enable CDR functionality, reinstall package with: \n" f" - pip install pachyderm_sdk[cdr]" ) self.cache = cache_location if self.cache is not None: self.cache = Path(os.path.expanduser(self.cache)).resolve() self.cache.mkdir(parents=True, exist_ok=True) self.fetch_missing_chunks = fetch_missing_chunks self.http_host_replacement = http_host_replacement def resolve(self, ref: Ref) -> bytes: """Resolve a CDR reference.""" field, body = which_one_of(ref, "body") if isinstance(body, Http): return self._dereference_http(body) elif isinstance(body, Cipher): return self._dereference_cipher(body) elif isinstance(body, Compress): return self._dereference_compress(body) elif isinstance(body, ContentHash): return self._dereference_content_hash(body) elif isinstance(body, SizeLimits): return self._dereference_size_limits(body) elif isinstance(body, Concat): return b"".join(map(self.resolve, body.refs)) elif isinstance(body, Slice): return self.resolve(body.inner)[body.start : body.end] else: raise ValueError(f"unsupported Ref variant: {body}") def _dereference_http(self, body: Http) -> bytes: """Resolves an HTTP CDR. This means retrieving the data from object storage using a presigned URL. The HTTP CDR is the "bottom" of the CDR structure and therefore resolution does not recurse any deeper. If http_host_replacement was configured on the class, this substitution is made here. Additionally the Host header will be overwritten with the original host of the presigned URL. """ url, headers = body.url, body.headers if self.http_host_replacement: parsed_url = urlparse(body.url) headers["Host"] = parsed_url.netloc url = parsed_url._replace(netloc=self.http_host_replacement).geturl() response = requests.get(url=url, headers=headers) try: response.raise_for_status() except requests.HTTPError as err: text = err.response.text if text: # If there's a response, log it in the error chain. raise requests.HTTPError( f"Error {err.response.status_code} - HTTP response: {text}" ) from err raise err return response.content def _dereference_cipher(self, body: Cipher) -> bytes: """Resolves a Cipher CDR. This method must resolve its inner CDR.""" if body.algo != CipherAlgo.CHACHA20: raise ValueError(f"unrecognized cipher algorithm: {body.algo}") inner = self.resolve(body.inner) cipher = ChaCha20.new(key=body.key, nonce=body.nonce) return cipher.decrypt(inner) def _dereference_compress(self, body: Compress) -> bytes: """Resolves a Compress CDR. This method must resolve its inner CDR.""" if body.algo != CompressAlgo.GZIP: raise ValueError(f"unrecognized compress algorithm: {body.algo}") inner = self.resolve(body.inner) return gzip.decompress(inner) def _dereference_content_hash(self, body: ContentHash) -> bytes: """Resolves a ContentHash CDR. This method must resolve its inner CDR. If the cache_location has been set: * the content will be read from disk if the content exists within the cache, rather than resolving the HTTP reference. * the content will be saved to the cache after resolving the HTTP reference. * the cache uses the hex string of the content hash as its key (file name). """ def _deref_inner() -> bytes: if body.algo != HashAlgo.BLAKE2b_256: raise ValueError(f"unrecognized hash algorithm: {body.algo}") inner = self.resolve(body.inner) inner_hash = blake2b(inner, digest_size=32).digest() if not compare_digest(inner_hash, body.hash): raise ValueError( f"content failed hash check. HAVE: {inner_hash} WANT: {body.hash}" ) return inner if not self.cache: return _deref_inner() chunk_file = self.cache.joinpath(self._chunk_name(body)) if chunk_file.exists(): return chunk_file.read_bytes() if not chunk_file.exists() and self.fetch_missing_chunks: content = _deref_inner() chunk_file.write_bytes(content) return content raise FileNotFoundError(f"chunk missing from cache: {chunk_file}") def _dereference_size_limits(self, body: SizeLimits) -> bytes: """Resolves a SizeLimits CDR. This method must resolve its inner CDR.""" inner = self.resolve(body.inner) if body.min and len(inner) < body.min: raise ValueError( f"content failed minimum size check. " f"HAVE: {len(inner)} bytes " f"WANT: {body.min} bytes " ) if body.max and len(inner) > body.max: raise ValueError( f"content failed minimum size check. " f"HAVE: {len(inner)} bytes " f"WANT: {body.max} bytes " ) return inner @staticmethod def _chunk_name(content_hash: ContentHash) -> str: algorith = HashAlgo(content_hash.algo) return f"{algorith.name.lower()}_{content_hash.hash.hex()}"
Methods
def resolve(self, ref: Ref) ‑> bytes
-
Resolve a CDR reference.
Expand source code
def resolve(self, ref: Ref) -> bytes: """Resolve a CDR reference.""" field, body = which_one_of(ref, "body") if isinstance(body, Http): return self._dereference_http(body) elif isinstance(body, Cipher): return self._dereference_cipher(body) elif isinstance(body, Compress): return self._dereference_compress(body) elif isinstance(body, ContentHash): return self._dereference_content_hash(body) elif isinstance(body, SizeLimits): return self._dereference_size_limits(body) elif isinstance(body, Concat): return b"".join(map(self.resolve, body.refs)) elif isinstance(body, Slice): return self.resolve(body.inner)[body.start : body.end] else: raise ValueError(f"unsupported Ref variant: {body}")