Datablocks

calculate_checksum(filename, chunksize=1024 * 1025)

Calculate an md5 hash of a file

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| filename | Path | absolute or relative path to file | required |
| chunksize | int | chunk size used when reading the file | 1024 * 1025 |

Returns:

| Type | Description |
|------|-------------|
| str | md5 hash as a hexadecimal string |

Source code in backend/archiver/utils/datablocks.py
def calculate_checksum(filename: Path, chunksize: int = 1024 * 1025) -> str:
    """Calculate an md5 hash of a file

    Args:
        filename (Path): absolute or relative path to file
        chunksize (int, optional): default chunk size to calculate hash on. Defaults to 1024*1025.

    Returns:
        str: hash as str
    """
    import hashlib
    m = hashlib.md5()
    with open(filename, 'rb') as f:
        while chunk := f.read(chunksize):
            m.update(chunk)
    return m.hexdigest()
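
A minimal usage sketch; the import path is an assumption derived from the source location shown above:

```python
from pathlib import Path

# Assumed import path, derived from the source file location above.
from backend.archiver.utils.datablocks import calculate_checksum

# Hash a file using the default chunk size of 1024 * 1025 bytes.
checksum = calculate_checksum(Path("data/scan_0001.dat"))
print(checksum)  # 32-character hexadecimal md5 digest
```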

create_datablock_entries(dataset_id, folder, origDataBlocks, tarballs)

Create datablock entries compliant with the schema provided by SciCat

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| dataset_id | int | Dataset identifier | required |
| folder | Path | Folder containing the tarballs | required |
| origDataBlocks | List[OrigDataBlock] | Original datablocks of the dataset as registered in SciCat | required |
| tarballs | List[Path] | Tarballs to create datablock entries for | required |

Returns:

| Type | Description |
|------|-------------|
| List[DataBlock] | One datablock entry per tarball |

Source code in backend/archiver/utils/datablocks.py
def create_datablock_entries(
        dataset_id: int, folder: Path, origDataBlocks: List[OrigDataBlock],
        tarballs: List[Path]) -> List[DataBlock]:
    """Create datablock entries compliant with schema provided by scicat

    Args:
        dataset_id (int): Dataset identifier
        folder (Path): folder containing the tarballs
        origDataBlocks (List[OrigDataBlock]): original datablocks of the dataset as registered in SciCat
        tarballs (List[Path]): tarballs to create datablock entries for

    Returns:
        List[DataBlock]: one datablock entry per tarball
    """
    datablocks: List[DataBlock] = []
    for tar in tarballs:
        o = origDataBlocks[0]

        data_file_list: List[DataFile] = []

        tar_path = folder / tar

        md5_hash = calculate_checksum(tar_path)

        # Close the tarball handle after reading its members
        with tarfile.open(tar_path) as tarball:
            for tar_info in tarball.getmembers():
                data_file_list.append(DataFile(
                    path=tar_info.path,
                    size=tar_info.size,
                    # time=tar_info.mtime
                    chk=str(md5_hash),
                    uid=str(tar_info.uid),
                    gid=str(tar_info.gid),
                    perm=str(tar_info.mode),
                    createdBy=str(tar_info.uname),
                    updatedBy=str(tar_info.uname),
                    # createdAt=tar_info.mtime,
                    # updatedAt=tar_info.mtime
                ))

        datablocks.append(DataBlock(
            id=str(uuid4()),
            archiveId=str(StoragePaths.relative_datablocks_folder(dataset_id) / tar_path.name),
            size=1,
            packedSize=1,
            chkAlg="md5",
            version=str(1),
            ownerGroup=o.ownerGroup,
            accessGroups=o.accessGroups,
            instrumentGroup=o.instrumentGroup,
            # createdBy=
            # updatedBy=
            # updatedAt=datetime.datetime.isoformat(),
            datasetId=str(dataset_id),
            dataFileList=data_file_list,
            rawDatasetId=o.rawdatasetId,
            derivedDatasetId=o.derivedDatasetId
        ))

    return datablocks
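
A sketch of how this function might be combined with create_tarballs. The import path is assumed from the source location above, and how the OrigDataBlock list is fetched from SciCat is outside this module, so it is taken as a given here:

```python
from pathlib import Path
from typing import List

# Assumed import path, derived from the source file location above.
from backend.archiver.utils.datablocks import create_datablock_entries, create_tarballs


def build_datablocks(dataset_id: int, folder: Path, orig_data_blocks: List["OrigDataBlock"]):
    # Pack the dataset folder into tarballs, then describe each tarball
    # as a DataBlock entry for registration in SciCat.
    tarballs = create_tarballs(dataset_id=dataset_id, folder=folder)
    # Note: only the first OrigDataBlock is consulted for the owner,
    # access and instrument groups.
    return create_datablock_entries(dataset_id, folder, orig_data_blocks, tarballs)
```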

create_tarballs(dataset_id, folder, target_size=300 * 1024 ** 2)

Pack the contents of a dataset folder into gzip-compressed tarballs.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| dataset_id | int | Dataset identifier used to name the tarballs | required |
| folder | Path | Folder containing the files to pack; the tarballs are created in the same folder | required |
| target_size | int | Target size of each tarball in bytes. Defaults to 300 * (1024 ** 2). | 300 * 1024 ** 2 |

Returns:

| Type | Description |
|------|-------------|
| List[Path] | Paths of the created tarballs, relative to folder |

Source code in backend/archiver/utils/datablocks.py
def create_tarballs(dataset_id: int, folder: Path,
                    target_size: int = 300 * (1024**2)) -> List[Path]:
    """_summary_

    Args:
        dataset_id (int): _description_
        folder (Path): _description_
        target_size (int, optional): _description_. Defaults to 300*(1024**2).

    Returns:
        List[Path]: _description_
    """

    # TODO: corner case: target size < file size
    tarballs: List[Path] = []

    filename: Path = Path(f"{dataset_id}_{len(tarballs)}.tar.gz")
    filepath = folder / filename

    tar = tarfile.open(filepath, 'x:gz', compresslevel=4)

    for f in folder.iterdir():
        file = f
        if file.suffix == ".gz":
            continue
        tar.add(file, recursive=False)

        if filepath.stat().st_size >= target_size:
            tar.close()
            tarballs.append(filename)
            filename = Path(f"{dataset_id}_{len(tarballs)}.tar.gz")
            filepath = folder / filename
            # Open the next volume with the same gzip compression as the first
            tar = tarfile.open(filepath, 'w:gz', compresslevel=4)

    tar.close()
    tarballs.append(filename)

    return tarballs
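
A minimal usage sketch; the import path and the staging folder are assumptions:

```python
from pathlib import Path

# Assumed import path, derived from the source file location above.
from backend.archiver.utils.datablocks import create_tarballs

# Pack everything in the staging folder into ~300 MiB gzip tarballs named
# "<dataset_id>_0.tar.gz", "<dataset_id>_1.tar.gz", ...
tarballs = create_tarballs(dataset_id=1234, folder=Path("/tmp/staging/1234"))

# The returned paths are relative to the folder argument.
print([str(t) for t in tarballs])
```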

download_object_from_s3(bucket, folder, object_name, target_path)

Download an object from S3 storage.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| bucket | Bucket | Bucket to look for file | required |
| folder | Path | S3 prefix for object | required |
| object_name | str | object name, no prefix | required |
| target_path | Path | absolute or relative path for the file to be created | required |
Source code in backend/archiver/utils/datablocks.py
def download_object_from_s3(bucket: Bucket, folder: Path, object_name: str, target_path: Path):
    """Download an object from S3 storage.

    Args:
        bucket (Bucket): Bucket to look for file
        folder (Path): s3 prefix for object
        object_name (str): object name, no prefix
        target_path (Path): absolute or relative path for the file to be created
    """
    S3Storage().fget_object(bucket=bucket, folder=str(folder), object_name=object_name, target_path=target_path)
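
A usage sketch; how a Bucket instance is constructed is not shown in this module, so it is passed in here, and the prefix layout is hypothetical:

```python
from pathlib import Path

# Assumed import path, derived from the source file location above.
from backend.archiver.utils.datablocks import download_object_from_s3


def fetch_tarball(bucket, dataset_id: int, name: str) -> Path:
    # Download a single object, e.g. "1234_0.tar.gz", into a local scratch folder.
    target = Path("/tmp/retrieval") / name
    target.parent.mkdir(parents=True, exist_ok=True)
    download_object_from_s3(
        bucket=bucket,                          # Bucket instance (construction not shown here)
        folder=Path(f"datasets/{dataset_id}"),  # hypothetical S3 prefix layout
        object_name=name,
        target_path=target,
    )
    return target
```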

download_objects_from_s3(prefix, bucket, destination_folder)

Download objects from S3 storage to a folder

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| prefix | Path | S3 prefix | required |
| bucket | Bucket | S3 bucket | required |
| destination_folder | Path | Target folder. Will be created if it does not exist. | required |

Returns:

| Type | Description |
|------|-------------|
| List[Path] | List of paths of created files |

Source code in backend/archiver/utils/datablocks.py
def download_objects_from_s3(prefix: Path, bucket: Bucket, destination_folder: Path) -> List[Path]:
    """Download objects form s3 storage to folder

    Args:
        prefix (Path): S3 prefix
        bucket (Bucket): s3 bucket
        destination_folder (Path): Target folder. Will be created if it does not exist.

    Returns:
        List[Path]: List of paths of created files
    """
    destination_folder.mkdir(parents=True, exist_ok=True)

    files: List[Path] = []

    for item in S3Storage().list_objects(bucket, str(prefix)):
        local_filepath = destination_folder / Path(item.object_name or "")
        local_filepath.parent.mkdir(parents=True, exist_ok=True)
        S3Storage().fget_object(bucket=bucket, folder=str(prefix), object_name=item.object_name or "", target_path=local_filepath)
        files.append(local_filepath)

    return files
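
A usage sketch along the same lines; the prefix layout and scratch folder are hypothetical, and the Bucket instance is taken as a given:

```python
from pathlib import Path
from typing import List

# Assumed import path, derived from the source file location above.
from backend.archiver.utils.datablocks import download_objects_from_s3


def fetch_dataset_files(bucket, dataset_id: int) -> List[Path]:
    # Mirror every object under the prefix into a local folder; the folder
    # (and any sub-folders) are created as needed by the function itself.
    return download_objects_from_s3(
        prefix=Path(f"datasets/{dataset_id}"),   # hypothetical prefix layout
        bucket=bucket,
        destination_folder=Path(f"/tmp/retrieval/{dataset_id}"),
    )
```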

list_s3_objects(prefix, bucket)

List all objects in an S3 bucket under the given prefix

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| prefix | Path | prefix for files to be listed | required |
| bucket | Bucket | S3 bucket | required |

Returns:

| Type | Description |
|------|-------------|
| Iterator[object] | Iterator over the objects |

Source code in backend/archiver/utils/datablocks.py
def list_s3_objects(prefix: Path, bucket: Bucket) -> Iterator[object]:
    """List all objects in s3 bucket and path

    Args:
        prefix (Path): prefix for files to be listed
        bucket (Bucket): s3 bucket

    Returns:
        Iterator[object]: iterator over the objects
    """
    getLogger().debug(f"Minio: {S3Storage().url}")
    return S3Storage().list_objects(bucket, str(prefix))
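
A minimal usage sketch; the import path and prefix layout are assumptions, and the Bucket instance is taken as a given:

```python
from pathlib import Path

# Assumed import path, derived from the source file location above.
from backend.archiver.utils.datablocks import list_s3_objects


def print_object_names(bucket, dataset_id: int) -> None:
    # Iterate lazily over the listing; object_name is the attribute used
    # elsewhere in this module (see download_objects_from_s3).
    for item in list_s3_objects(Path(f"datasets/{dataset_id}"), bucket):
        print(item.object_name)
```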