Source code for aitoolbox.cloud.AWS.data_access

from abc import ABC, abstractmethod
import os
import shutil

import boto3
import botocore.exceptions


class BaseDataSaver:
    def __init__(self, bucket_name='model-result'):
        """Base class implementing the S3 file saving logic

        Args:
            bucket_name (str): S3 bucket into which the files will be saved
        """
        self.bucket_name = bucket_name
        self.s3_client = boto3.client('s3')

    def save_file(self, local_file_path, cloud_file_path):
        """Save / upload a file from the local drive to AWS S3

        Args:
            local_file_path (str): path to the file on the local drive
            cloud_file_path (str): destination where the file will be saved on S3 inside the specified bucket

        Returns:
            None
        """
        self.s3_client.upload_file(os.path.expanduser(local_file_path),
                                   self.bucket_name, cloud_file_path)

    def save_folder(self, local_folder_path, cloud_folder_path):
        """Save / upload the contents of a folder on the local drive to AWS S3

        This function uploads the *contents inside* the provided local folder. If the enclosing folder should
        also be created on S3, specify the folder name at the end of the ``cloud_folder_path``.

        For example, if ``local_folder_path = '~/bla/my_folder'`` and the content of *my_folder* should also
        be placed into a folder *my_folder* on S3, then append *my_folder* at the end of the cloud_folder_path:
        ``cloud_folder_path = 'cloud_bla/my_folder'``

        Args:
            local_folder_path (str): local path to the folder which should be uploaded
            cloud_folder_path (str): destination path on S3 where the folder and its content should be uploaded

        Returns:
            None
        """
        for root, dirs, files in os.walk(local_folder_path):
            for filename in files:
                local_file_path = os.path.join(root, filename)
                # Rebuild the file's path relative to the uploaded folder so the S3 keys mirror the local layout
                file_path_inside_folder = os.path.relpath(local_file_path, local_folder_path)
                s3_file_path = os.path.join(cloud_folder_path, file_path_inside_folder)

                self.s3_client.upload_file(local_file_path, self.bucket_name, s3_file_path)
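
# Usage sketch (illustrative addition, not part of the original module): the bucket
# name and paths below are hypothetical placeholders, and configured AWS credentials
# are assumed.
#
#     saver = BaseDataSaver(bucket_name='model-result')
#     saver.save_file(local_file_path='~/project/results/model.pth',
#                     cloud_file_path='experiment_1/model.pth')
#     saver.save_folder(local_folder_path='~/project/results/my_folder',
#                       cloud_folder_path='experiment_1/my_folder')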


class BaseDataLoader:
    def __init__(self, bucket_name='dataset-store', local_base_data_folder_path='~/project/data'):
        """Base class implementing the S3 file downloading logic

        Args:
            bucket_name (str): S3 bucket from which the files will be downloaded
            local_base_data_folder_path (str): local base folder into which the data will be downloaded
        """
        self.bucket_name = bucket_name
        self.s3 = boto3.resource('s3')

        self.local_base_data_folder_path = os.path.expanduser(local_base_data_folder_path)
        self.available_preprocessed_datasets = []

    def load_file(self, cloud_file_path, local_file_path):
        """Download a file from AWS S3 to the local drive

        Args:
            cloud_file_path (str): location where the file is saved on S3 inside the specified bucket
            local_file_path (str): destination path where the file will be downloaded to on the local drive

        Returns:
            None
        """
        local_file_path = os.path.expanduser(local_file_path)

        if os.path.isfile(local_file_path):
            print('File already exists on the local disk. Not downloading from S3.')
        else:
            print('File does not exist on the local disk. Downloading from S3.')
            try:
                self.s3.Bucket(self.bucket_name).download_file(cloud_file_path, local_file_path)
            except botocore.exceptions.ClientError as e:
                if e.response['Error']['Code'] == '404':
                    print('The object does not exist on S3.')
                else:
                    raise

    def exists_local_data_folder(self, data_folder_name, protect_local_folder=True):
        """Check if a specific folder exists in the base data folder

        For example, the Squad dataset folder inside the /data folder, or the pretrained_models folder
        inside the /model_results folder.

        Args:
            data_folder_name (str): name of the data folder inside the base data folder
            protect_local_folder (bool): if True, an existing local folder is kept as it is; if False,
                an existing local folder is deleted and recreated empty

        Returns:
            bool: True if the local folder exists and is protected, False otherwise
        """
        local_folder_path = os.path.join(self.local_base_data_folder_path, data_folder_name)

        if os.path.exists(local_folder_path):
            if protect_local_folder:
                print('Local folder exists and protect_local_folder is True: leaving local folder as it is.')
                return True

            print('Local folder exists and protect_local_folder is False: deleting local folder copy.')
            shutil.rmtree(local_folder_path)
            os.mkdir(local_folder_path)

        return False

    def preproc_dataset_available(self, preproc_dataset_name):
        """Check if a preprocessed version of the dataset is available

        Args:
            preproc_dataset_name (str): name of the preprocessed dataset

        Returns:
            bool
        """
        return preproc_dataset_name in self.available_preprocessed_datasets
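
# Usage sketch (illustrative addition, not part of the original module): the bucket
# name and paths are hypothetical, and configured AWS credentials are assumed.
#
#     loader = BaseDataLoader(bucket_name='dataset-store',
#                             local_base_data_folder_path='~/project/data')
#     if not loader.exists_local_data_folder('SQuAD2', protect_local_folder=True):
#         os.makedirs(os.path.join(loader.local_base_data_folder_path, 'SQuAD2'), exist_ok=True)
#         loader.load_file(cloud_file_path='SQuAD2/train-v2.0.json',
#                          local_file_path='~/project/data/SQuAD2/train-v2.0.json')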


class AbstractDatasetFetcher(ABC):
    @abstractmethod
    def fetch_dataset(self, dataset_name=None, protect_local_folder=True):
        """Fetch the dataset and store it in the local data folder

        Args:
            dataset_name (str or None): name of the dataset to fetch
            protect_local_folder (bool): if an already existing local copy of the dataset should be kept
                instead of being deleted and re-downloaded

        Returns:
            None
        """
        pass
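
# Example subclass sketch (illustrative addition, not part of the original module):
# one possible way to combine BaseDataLoader with the AbstractDatasetFetcher
# interface. The SQuAD2DatasetFetcher name and the file paths are hypothetical.
#
#     class SQuAD2DatasetFetcher(AbstractDatasetFetcher, BaseDataLoader):
#         def fetch_dataset(self, dataset_name=None, protect_local_folder=True):
#             if not self.exists_local_data_folder('SQuAD2', protect_local_folder):
#                 os.makedirs(os.path.join(self.local_base_data_folder_path, 'SQuAD2'), exist_ok=True)
#                 self.load_file(cloud_file_path='SQuAD2/train-v2.0.json',
#                                local_file_path='~/project/data/SQuAD2/train-v2.0.json')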