Datablocks

calculate_checksum(filename, chunksize=1024 * 1025)

Calculate an md5 hash of a file

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| filename | Path | absolute or relative path to file | required |
| chunksize | int | chunk size used when reading the file | 1024 * 1025 |

Returns:

| Type | Description |
|------|-------------|
| str | md5 hash as a hexadecimal string |

Source code in backend/archiver/utils/datablocks.py
def calculate_checksum(filename: Path, chunksize: int = 1024 * 1025) -> str:
    """Calculate an md5 hash of a file

    Args:
        filename (Path): absolute or relative path to file
        chunksize (int, optional): default chunk size to calculate hash on. Defaults to 1024*1025.

    Returns:
        str: hash as str
    """
    import hashlib
    m = hashlib.md5()
    with open(filename, 'rb') as f:
        while chunk := f.read(chunksize):
            m.update(chunk)
    return m.hexdigest()
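
A minimal usage sketch; the import path is an assumption derived from the source location shown above:

```python
from pathlib import Path

# Assumed import path, derived from the source file location above.
from backend.archiver.utils.datablocks import calculate_checksum

# Hash a file using the default chunk size of 1024 * 1025 bytes.
checksum = calculate_checksum(Path("data/scan_0001.dat"))
print(checksum)  # 32-character hexadecimal md5 digest
```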

create_datablock_entries(dataset_id, folder, origDataBlocks, tarballs)

Create datablock entries compliant with the schema provided by SciCat

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| dataset_id | int | Dataset identifier | required |
| folder | Path | Folder containing the tarballs | required |
| origDataBlocks | List[OrigDataBlock] | Original datablocks of the dataset as registered in SciCat | required |
| tarballs | List[Path] | Tarballs to create datablock entries for | required |

Returns:

| Type | Description |
|------|-------------|
| List[DataBlock] | One datablock entry per tarball |

Source code in backend/archiver/utils/datablocks.py
def create_datablock_entries(
        dataset_id: int, folder: Path, origDataBlocks: List[OrigDataBlock],
        tarballs: List[Path]) -> List[DataBlock]:
    """Create datablock entries compliant with schema provided by scicat

    Args:
        dataset_id (int): Dataset identifier
        folder (Path): folder containing the tarballs
        origDataBlocks (List[OrigDataBlock]): original datablocks of the dataset as registered in SciCat
        tarballs (List[Path]): tarballs to create datablock entries for

    Returns:
        List[DataBlock]: one datablock entry per tarball
    """
    datablocks: List[DataBlock] = []
    for tar in tarballs:
        o = origDataBlocks[0]

        data_file_list: List[DataFile] = []

        tar_path = folder / tar

        md5_hash = calculate_checksum(tar_path)

        # Close the tarball handle after reading its members
        with tarfile.open(tar_path) as tarball:
            for tar_info in tarball.getmembers():
                data_file_list.append(DataFile(
                    path=tar_info.path,
                    size=tar_info.size,
                    # time=tar_info.mtime
                    chk=str(md5_hash),
                    uid=str(tar_info.uid),
                    gid=str(tar_info.gid),
                    perm=str(tar_info.mode),
                    createdBy=str(tar_info.uname),
                    updatedBy=str(tar_info.uname),
                    # createdAt=tar_info.mtime,
                    # updatedAt=tar_info.mtime
                ))

        datablocks.append(DataBlock(
            id=str(uuid4()),
            archiveId=str(StoragePaths.relative_datablocks_folder(dataset_id) / tar_path.name),
            size=1,
            packedSize=1,
            chkAlg="md5",
            version=str(1),
            ownerGroup=o.ownerGroup,
            accessGroups=o.accessGroups,
            instrumentGroup=o.instrumentGroup,
            # createdBy=
            # updatedBy=
            # updatedAt=datetime.datetime.isoformat(),
            datasetId=str(dataset_id),
            dataFileList=data_file_list,
            rawDatasetId=o.rawdatasetId,
            derivedDatasetId=o.derivedDatasetId
        ))

    return datablocks
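
A sketch of how this function might be combined with create_tarballs. The import path is assumed from the source location above, and how the OrigDataBlock list is fetched from SciCat is outside this module, so it is taken as a given here:

```python
from pathlib import Path
from typing import List

# Assumed import path, derived from the source file location above.
from backend.archiver.utils.datablocks import create_datablock_entries, create_tarballs


def build_datablocks(dataset_id: int, folder: Path, orig_data_blocks: List["OrigDataBlock"]):
    # Pack the dataset folder into tarballs, then describe each tarball
    # as a DataBlock entry for registration in SciCat.
    tarballs = create_tarballs(dataset_id=dataset_id, folder=folder)
    # Note: only the first OrigDataBlock is consulted for the owner,
    # access and instrument groups.
    return create_datablock_entries(dataset_id, folder, orig_data_blocks, tarballs)
```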

create_tarballs(dataset_id, folder, target_size=300 * 1024 ** 2)

Pack the contents of a dataset folder into gzip-compressed tarballs.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| dataset_id | int | Dataset identifier used to name the tarballs | required |
| folder | Path | Folder containing the files to pack; the tarballs are created in the same folder | required |
| target_size | int | Target size of each tarball in bytes. Defaults to 300 * (1024 ** 2). | 300 * 1024 ** 2 |

Returns:

| Type | Description |
|------|-------------|
| List[Path] | Paths of the created tarballs, relative to folder |

Source code in backend/archiver/utils/datablocks.py
def create_tarballs(dataset_id: int, folder: Path,
                    target_size: int = 300 * (1024**2)) -> List[Path]:
    """_summary_

    Args:
        dataset_id (int): _description_
        folder (Path): _description_
        target_size (int, optional): _description_. Defaults to 300*(1024**2).

    Returns:
        List[Path]: _description_
    """

    # TODO: corner case: target size < file size
    tarballs: List[Path] = []

    filename: Path = Path(f"{dataset_id}_{len(tarballs)}.tar.gz")
    filepath = folder / filename

    tar = tarfile.open(filepath, 'x:gz', compresslevel=4)

    for f in folder.iterdir():
        file = f
        if file.suffix == ".gz":
            continue
        tar.add(file, recursive=False)

        if filepath.stat().st_size >= target_size:
            tar.close()
            tarballs.append(filename)
            filename = Path(f"{dataset_id}_{len(tarballs)}.tar.gz")
            filepath = folder / filename
            # Open the next volume with the same gzip compression as the first
            tar = tarfile.open(filepath, 'w:gz', compresslevel=4)

    tar.close()
    tarballs.append(filename)

    return tarballs
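
A minimal usage sketch; the import path and the staging folder are assumptions:

```python
from pathlib import Path

# Assumed import path, derived from the source file location above.
from backend.archiver.utils.datablocks import create_tarballs

# Pack everything in the staging folder into ~300 MiB gzip tarballs named
# "<dataset_id>_0.tar.gz", "<dataset_id>_1.tar.gz", ...
tarballs = create_tarballs(dataset_id=1234, folder=Path("/tmp/staging/1234"))

# The returned paths are relative to the folder argument.
print([str(t) for t in tarballs])
```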

download_object_from_s3(bucket, folder, object_name, target_path)

Download an object from S3 storage.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| bucket | Bucket | Bucket to look for file | required |
| folder | Path | S3 prefix for object | required |
| object_name | str | object name, no prefix | required |
| target_path | Path | absolute or relative path for the file to be created | required |
Source code in backend/archiver/utils/datablocks.py
def download_object_from_s3(bucket: Bucket, folder: Path, object_name: str, target_path: Path):
    """Download an object from S3 storage.

    Args:
        bucket (Bucket): Bucket to look for file
        folder (Path): s3 prefix for object
        object_name (str): object name, no prefix
        target_path (Path): absolute or relative path for the file to be created
    """
    S3Storage().fget_object(bucket=bucket, folder=str(folder), object_name=object_name, target_path=target_path)
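
A usage sketch; how a Bucket instance is constructed is not shown in this module, so it is passed in here, and the prefix layout is hypothetical:

```python
from pathlib import Path

# Assumed import path, derived from the source file location above.
from backend.archiver.utils.datablocks import download_object_from_s3


def fetch_tarball(bucket, dataset_id: int, name: str) -> Path:
    # Download a single object, e.g. "1234_0.tar.gz", into a local scratch folder.
    target = Path("/tmp/retrieval") / name
    target.parent.mkdir(parents=True, exist_ok=True)
    download_object_from_s3(
        bucket=bucket,                          # Bucket instance (construction not shown here)
        folder=Path(f"datasets/{dataset_id}"),  # hypothetical S3 prefix layout
        object_name=name,
        target_path=target,
    )
    return target
```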

download_objects_from_s3(prefix, bucket, destination_folder)

Download objects from S3 storage to a folder

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| prefix | Path | S3 prefix | required |
| bucket | Bucket | S3 bucket | required |
| destination_folder | Path | Target folder. Will be created if it does not exist. | required |

Returns:

| Type | Description |
|------|-------------|
| List[Path] | List of paths of created files |

Source code in backend/archiver/utils/datablocks.py
def download_objects_from_s3(prefix: Path, bucket: Bucket, destination_folder: Path) -> List[Path]:
    """Download objects form s3 storage to folder

    Args:
        prefix (Path): S3 prefix
        bucket (Bucket): s3 bucket
        destination_folder (Path): Target folder. Will be created if it does not exist.

    Returns:
        List[Path]: List of paths of created files
    """
    destination_folder.mkdir(parents=True, exist_ok=True)

    files: List[Path] = []

    for item in S3Storage().list_objects(bucket, str(prefix)):
        local_filepath = destination_folder / Path(item.object_name or "")
        local_filepath.parent.mkdir(parents=True, exist_ok=True)
        S3Storage().fget_object(bucket=bucket, folder=str(prefix), object_name=item.object_name or "", target_path=local_filepath)
        files.append(local_filepath)

    return files
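
A usage sketch along the same lines; the prefix layout and scratch folder are hypothetical, and the Bucket instance is taken as a given:

```python
from pathlib import Path
from typing import List

# Assumed import path, derived from the source file location above.
from backend.archiver.utils.datablocks import download_objects_from_s3


def fetch_dataset_files(bucket, dataset_id: int) -> List[Path]:
    # Mirror every object under the prefix into a local folder; the folder
    # (and any sub-folders) are created as needed by the function itself.
    return download_objects_from_s3(
        prefix=Path(f"datasets/{dataset_id}"),   # hypothetical prefix layout
        bucket=bucket,
        destination_folder=Path(f"/tmp/retrieval/{dataset_id}"),
    )
```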

list_s3_objects(prefix, bucket)

List all objects in an S3 bucket under the given prefix

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| prefix | Path | prefix for files to be listed | required |
| bucket | Bucket | S3 bucket | required |

Returns:

| Type | Description |
|------|-------------|
| Iterator[object] | Iterator over the objects |

Source code in backend/archiver/utils/datablocks.py
def list_s3_objects(prefix: Path, bucket: Bucket) -> Iterator[object]:
    """List all objects in s3 bucket and path

    Args:
        prefix (Path): prefix for files to be listed
        bucket (Bucket): s3 bucket

    Returns:
        Iterator[object]: iterator over the objects
    """
    getLogger().debug(f"Minio: {S3Storage().url}")
    return S3Storage().list_objects(bucket, str(prefix))
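
A minimal usage sketch; the import path and prefix layout are assumptions, and the Bucket instance is taken as a given:

```python
from pathlib import Path

# Assumed import path, derived from the source file location above.
from backend.archiver.utils.datablocks import list_s3_objects


def print_object_names(bucket, dataset_id: int) -> None:
    # Iterate lazily over the listing; object_name is the attribute used
    # elsewhere in this module (see download_objects_from_s3).
    for item in list_s3_objects(Path(f"datasets/{dataset_id}"), bucket):
        print(item.object_name)
```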