Study Notes
Most common:
- Azure Storage (blob and file containers)
- Azure Data Lake stores
- Azure SQL Database
- Azure Databricks file system (DBFS)
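Additional datastores must be registered with the workspace before they can be used. A minimal sketch of registering an Azure Blob container as a datastore (the datastore, container, and account names below are hypothetical placeholders):
from azureml.core import Workspace, Datastore
ws = Workspace.from_config()
# Register an Azure Blob container as a datastore
# (names and key are placeholders, not real values)
blob_ds = Datastore.register_azure_blob_container(workspace=ws,
                                                  datastore_name='my_blob_datastore',
                                                  container_name='my-container',
                                                  account_name='mystorageaccount',
                                                  account_key='<account-key>')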
# Get all datastores in a workspace
from azureml.core import Workspace, Datastore, Dataset
ws = Workspace.from_config()
for ds_name in ws.datastores:
    print(ds_name)
Result:
azureml_globaldatasets
workspaceworkingdirectory
workspaceartifactstore
workspacefilestore
workspaceblobstore
# Get default datastore
default_store = ws.get_default_datastore()
print(default_store)
Result:
{
"name": "workspaceblobstore",
"container_name": "azureml-blobstore-f2b9ce82-2767-48c8-9f4a-xxxxxxxxxxxxxxxx",
"account_name": "vsmlwsstorageb73bnnnnnnn",
"protocol": "https",
"endpoint": "core.windows.net"
}
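A different registered datastore can be made the default for the workspace. A short sketch, using the hypothetical datastore name from above:
# Change the workspace default datastore (name is a placeholder)
ws.set_default_datastore('my_blob_datastore')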
Datasets:
- Tabular:
Data is read from the dataset as a table. Use it when working with Pandas dataframes.
# create and register
from azureml.core import Dataset
blob_ds = ws.get_default_datastore()
csv_paths = [(blob_ds, 'data/files/file1.csv'),
             (blob_ds, 'data/files/archive/*.csv')]
# Create
tab_ds = Dataset.Tabular.from_delimited_files(path=csv_paths)
# Register
tab_ds = tab_ds.register(workspace=ws, name='table_dataset_name')
- File:
The dataset presents a list of file paths that can be read as if from the file system. Use it for unstructured data such as images.
# create and register
from azureml.core import Dataset
blob_ds = ws.get_default_datastore()
# Create
file_ds = Dataset.File.from_files(path=(blob_ds, 'data/files/images/*.jpg'))
# Register
file_ds = file_ds.register(workspace=ws, name='files_dataset_name')
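Once registered, a dataset can be retrieved by name in any later session. A minimal sketch, reusing the names registered above:
from azureml.core import Dataset
# Retrieve registered datasets by name (latest version by default)
tab_ds = Dataset.get_by_name(ws, name='table_dataset_name')
file_ds = Dataset.get_by_name(ws, name='files_dataset_name')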
In experiments it is very common to change the input data.
How it works:
The experiment/job (model) is created and sealed.
New data must be fed in to get predictions, or to retrain and keep the model up to date (data drift).
All we need to do is set a new data source and feed it to the model.
A data source can be passed as a parameter to the experiment script in one of two ways:
- argument (ID of the dataset / reference to the files location)
- named input (name of the dataset / name of the files location)
1. Tabular dataset - pass a tabular dataset as argument to an experiment script
ScriptRunConfig
# Create the environment and run configuration
from azureml.core import Environment, ScriptRunConfig
from azureml.core.conda_dependencies import CondaDependencies

env = Environment('my_env')
packages = CondaDependencies.create(conda_packages=['pip'],
                                    pip_packages=['azureml-defaults',
                                                  'azureml-dataprep[pandas]'])
env.python.conda_dependencies = packages
script_config = ScriptRunConfig(source_directory='my_dir',
                                script='script.py',
                                arguments=['--ds', tab_ds],  # the dataset ID is passed to the script
                                environment=env)
script.py (the experiment script):
import argparse
from azureml.core import Run, Dataset

# Parse the dataset ID passed as an argument
parser = argparse.ArgumentParser()
parser.add_argument('--ds', type=str, dest='dataset_id')
args = parser.parse_args()

# Get the dataset from the run's workspace by its ID
run = Run.get_context()
ws = run.experiment.workspace
dataset = Dataset.get_by_id(ws, id=args.dataset_id)
data = dataset.to_pandas_dataframe()
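With the ScriptRunConfig prepared, the run is submitted through an Experiment. A minimal sketch, assuming a hypothetical experiment name; the same submission pattern applies to the three variants below:
from azureml.core import Experiment
# Submit the configured script run ('my-experiment' is a placeholder name)
experiment = Experiment(workspace=ws, name='my-experiment')
run = experiment.submit(config=script_config)
run.wait_for_completion(show_output=True)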
2. Tabular dataset - pass a tabular dataset as named input to an experiment script
ScriptRunConfig
# Create the environment and run configuration
from azureml.core import Environment, ScriptRunConfig
from azureml.core.conda_dependencies import CondaDependencies

env = Environment('my_env')
packages = CondaDependencies.create(conda_packages=['pip'],
                                    pip_packages=['azureml-defaults',
                                                  'azureml-dataprep[pandas]'])
env.python.conda_dependencies = packages
script_config = ScriptRunConfig(source_directory='my_dir',
                                script='script.py',
                                arguments=['--ds', tab_ds.as_named_input('MY_DATA_SET_NAME')],
                                environment=env)
script.py (the experiment script):
import argparse
from azureml.core import Run

# The argument still arrives, but the dataset is resolved by its input name
parser = argparse.ArgumentParser()
parser.add_argument('--ds', type=str, dest='dataset_id', help='MY_DATA_SET_NAME')
args = parser.parse_args()

run = Run.get_context()
dataset = run.input_datasets['MY_DATA_SET_NAME']
data = dataset.to_pandas_dataframe()
3. File dataset - pass a file dataset as argument to an experiment script
ScriptRunConfig
# Create the environment and run configuration
from azureml.core import Environment, ScriptRunConfig
from azureml.core.conda_dependencies import CondaDependencies

env = Environment('my_env')
packages = CondaDependencies.create(conda_packages=['pip'],
                                    pip_packages=['azureml-defaults',
                                                  'azureml-dataprep[pandas]'])
env.python.conda_dependencies = packages
script_config = ScriptRunConfig(source_directory='my_dir',
                                script='script.py',
                                arguments=['--ds', file_ds.as_download()],  # use as_mount() if downloading is not possible or not wanted
                                environment=env)
script.py (the experiment script):
import argparse
import glob
from azureml.core import Run

# Parse the download (or mount) path passed as an argument
parser = argparse.ArgumentParser()
parser.add_argument('--ds', type=str, dest='ds_ref')
args = parser.parse_args()
run = Run.get_context()
imgs = glob.glob(args.ds_ref + "/*.jpg")
4. File dataset - pass a file dataset as named input to an experiment script
ScriptRunConfig
# Create the environment and run configuration
from azureml.core import Environment, ScriptRunConfig
from azureml.core.conda_dependencies import CondaDependencies

env = Environment('my_env')
packages = CondaDependencies.create(conda_packages=['pip'],
                                    pip_packages=['azureml-defaults',
                                                  'azureml-dataprep[pandas]'])
env.python.conda_dependencies = packages
script_config = ScriptRunConfig(source_directory='my_dir',
                                script='script.py',
                                arguments=['--ds', file_ds.as_named_input('MY_DATASET_NAME').as_download()],  # use as_mount() if downloading is not possible or not wanted
                                environment=env)
script.py (the experiment script):
import argparse
import glob
from azureml.core import Run

parser = argparse.ArgumentParser()
parser.add_argument('--ds', type=str, dest='ds_ref', help='MY_DATASET_NAME')
args = parser.parse_args()

# The named input resolves to the local download (or mount) path
run = Run.get_context()
dataset = run.input_datasets['MY_DATASET_NAME']
imgs = glob.glob(dataset + "/*.jpg")
List the datastores and the registered datasets in a workspace.
# Datastores in the workspace
for ds_name in ws.datastores:
    print(ds_name)

# Get default datastore
default_store = ws.get_default_datastore()
print(default_store)

# Datasets registered in the workspace
my_datasets = ws.datasets
print(my_datasets)
Result:
azureml_globaldatasets
workspaceworkingdirectory
workspaceartifactstore
workspacefilestore
workspaceblobstore
{
"name": "workspaceblobstore",
"container_name": "azureml-blobstore-f2b9ce82-2767-48c8-9f4a-74e23f6a20ec",
"account_name": "vsmlwsstorageb73b5875214",
"protocol": "https",
"endpoint": "core.windows.net"
}
{ 'diabetes dataset': DatasetRegistration(id='69b60b5f-47b3-433f-bc0d-ae7c0a0be3a0', name='diabetes dataset', version=1, description='diabetes data', tags={'format': 'CSV'}),
'diabetes file dataset': DatasetRegistration(id='e86078b9-f3a4-4d76-b88c-09cc05c2564b', name='diabetes file dataset', version=1, description='diabetes files', tags={'format': 'CSV'})}
If a dataset with the same name is already registered, registering it again fails unless it is registered as a new version.
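A minimal sketch of versioned re-registration, reusing tab_ds and the dataset name from above:
from azureml.core import Dataset
# Re-register under the same name as a new version
tab_ds = tab_ds.register(workspace=ws,
                         name='table_dataset_name',
                         create_new_version=True)
# A specific version can be retrieved by number
tab_ds_v1 = Dataset.get_by_name(ws, name='table_dataset_name', version=1)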
References:
Use datasets - Training | Microsoft Learn