Browse Source

Added tooling for check->compress->upload workflow

add-file-preservation
Drew Short 4 years ago
parent
commit
1e222f3c06
  1. 65
      README.md
  2. 119
      acm.py

65
README.md

@ -69,12 +69,12 @@ $ ./acm.py list -x <bucket>
List all files while adding a prefix and stripping a suffix
```bash
$ ./acm.py -p <prefix> list -x <bucket> --suffix <suffix>
$ ./acm.py --remove-prefix <prefix> list -x <bucket> --suffix <suffix>
```
List all files with sha256sum compatible output
```bash
$ ./acm.py --prefix "/tmp/" --stdin list -x testing --suffix .json --sha256sum
$ ./acm.py --remove-prefix "/tmp/" --stdin list -x testing --suffix .json --sha256sum
```
Print out a sha256sum compatible check list
@ -85,12 +85,12 @@ Do a comparison of the remote bucket for files with a matching sha256sum value.
Process a list of files
```bash
$ ./acm.py -p <prefix to strip> match -x <bucket> FILES...
$ ./acm.py --remove-prefix <prefix to strip> match -x <bucket> FILES...
```
Process a list from stdin
```bash
$ find /tmp -name '*.jpg' | ./acm.py -p <prefix to strip> match -x <bucket>
$ find /tmp -name '*.jpg' | ./acm.py --remove-prefix <prefix to strip> match -x <bucket>
```
### Checking For Changes
@ -99,12 +99,12 @@ Do a comparison of the remote bucket for missing files or files with a mismatch
Process a list of files
```bash
$ ./acm.py -p <prefix to strip> check -x <bucket> FILES...
$ ./acm.py --remove-prefix <prefix to strip> check -x <bucket> FILES...
```
Process a list from stdin
```bash
$ find /tmp -name '*.jpg' | ./acm.py -p <prefix to strip> check -x <bucket>
$ find /tmp -name '*.jpg' | ./acm.py --remove-prefix <prefix to strip> check -x <bucket>
```
### Updating Metadata For Changed Files
@ -113,12 +113,12 @@ Update the remote bucket with new metadata for the listed files. Calculates new
Process a list of files
```bash
$ ./acm.py -p <prefix to strip> update -x <bucket> FILES...
$ ./acm.py --remove-prefix <prefix to strip> update -x <bucket> FILES...
```
Process a list from stdin
```bash
$ find /tmp -name '*.jpg' | ./acm.py -p <prefix to strip> update -x <bucket>
$ find /tmp -name '*.jpg' | ./acm.py --remove-prefix <prefix to strip> update -x <bucket>
```
### Storing Files
@ -127,12 +127,12 @@ Store the listed files in `<bucket>-data`.
Process a list of files
```bash
$ ./acm.py -p <prefix to strip> store -x <bucket> FILES...
$ ./acm.py --remove-prefix <prefix to strip> store -x <bucket> FILES...
```
Process a list from stdin
```bash
$ find /tmp -name '*.jpg' | ./acm.py -p <prefix to strip> store -x <bucket>
$ find /tmp -name '*.jpg' | ./acm.py --remove-prefix <prefix to strip> store -x <bucket>
```
### Retrieving Files
@ -141,18 +141,59 @@ Retrieve remote files matching listed files. Optionally place the downloaded fil
Process a list of files
```bash
$ ./acm.py -p <prefix to strip> retrieve -x <bucket> [-d <destination>] FILES...
$ ./acm.py --remove-prefix <prefix to strip> retrieve -x <bucket> [-d <destination>] FILES...
```
Process a list from stdin
```bash
$ find /tmp -name '*.jpg' | ./acm.py -p <prefix to strip> retrieve -x <bucket> [-d <destination>]
$ find /tmp -name '*.jpg' | ./acm.py --remove-prefix <prefix to strip> retrieve -x <bucket> [-d <destination>]
```
### Configuring Profiles
### Compressing Changed Assets
Compressing assets based on profiles
```bash
```
## Usage as a workflow
Compressing changed files and storing them for later use
1. Identify changed files
- `find <file_location> -type f | ./acm.py --stdin --remove-prefix <file_location> check -x aggressive`
1. Pass identified files to the compressor with a specific profile
- `<found_files> | ./acm.py --stdin --remove-prefix <file_location> compress -p aggressive -d /tmp/profile-aggressive/`
1. Store compressed assets in data bucket
- `<compressed_files> | ./acm.py --stdin --remove-prefix /tmp/profile-aggressive/ store -x aggressive`
1. Update metadata about files
- `<found_files> | ./acm.py --stdin --remove-prefix <file_location> update -x aggressive`
As a combined workflow to only compress and store changed assets
```bash
export TARGET_DIRECTORY=/mnt/e/data/files/
export CONTEXT=android
export PROFILE=aggressive
export PROFILE_TMP_DIR="/tmp/profile-${PROFILE}/"
export COMPRESSED_FILES="$(mktemp)"
# Compress changed assets
find "$TARGET_DIRECTORY" -type f | ./acm.py --stdin --remove-prefix "$TARGET_DIRECTORY" check -x "$CONTEXT" | ./acm.py --stdin --remove-prefix "$TARGET_DIRECTORY" compress --print-input-and-identity -p "$PROFILE" -d "$PROFILE_TMP_DIR" > "$COMPRESSED_FILES"
# Store compressed assets
find "$PROFILE_TMP_DIR" -type f | ./acm.py --stdin --remove-prefix "$PROFILE_TMP_DIR" --add-prefix "$TARGET_DIRECTORY" store -x "${CONTEXT}-data" > /dev/null
# Update Asset manifests
cat "$COMPRESSED_FILES" | ./acm.py --stdin --remove-prefix "$TARGET_DIRECTORY" update -x "$CONTEXT" --input-and-identity > /dev/null
```
As a combined workflow to download matching assets
```bash
TARGET_DIRECTORY=/mnt/e/data/files/
CONTEXT=android
PROFILE=aggressive
PROFILE_TMP_DIR="/tmp/profile-${PROFILE}"
find "$TARGET_DIRECTORY" -type f | ./acm.py --stdin --remove-prefix "$TARGET_DIRECTORY" match -x "$CONTEXT" --print-identity | ./acm.py --stdin retrieve -x "${CONTEXT}-data" -d "$PROFILE_TMP_DIR" > /dev/null
```
## Contributing
## License

119
acm.py

@ -5,6 +5,7 @@ import io
import json
import os
import platform
import sys
import tempfile
from typing import List, Dict, Callable
@ -12,12 +13,6 @@ import click
from minio import Minio, ResponseError
from minio.error import NoSuchKey
# MinIO Metadata Prefix
METADATA_PREFIX = 'X-Amz-Meta-'
# Metadata Constants
METADATA_SHA256SUM = "Sha256sum"
# Size of the buffer to read files with
BUF_SIZE = 4096
@ -118,8 +113,8 @@ def strip_prefix(prefix: str, file: str) -> str:
def get_file_identity(ctx_obj, file):
if 'PREFIX' in ctx_obj and ctx_obj['PREFIX'] is not None:
path = strip_prefix(ctx_obj['PREFIX'], file)
if 'REMOVE_PREFIX' in ctx_obj and ctx_obj['REMOVE_PREFIX'] is not None:
path = strip_prefix(ctx_obj['REMOVE_PREFIX'], file)
else:
path = file
@ -159,15 +154,15 @@ def prep_s3(ctx):
return s3_bucket, s3
def get_file_sha256sum(s3, s3_bucket, file_identity, file):
file_object = s3.stat_object(s3_bucket, file_identity)
stored_file_hash = file_object.metadata[get_metadata_name("SHA256SUM")]
def get_file_sha256sum(stored_data, file):
stored_file_hash = stored_data['sha256sum']
stored_profile_hash = stored_data['profilesHash']
sha256sum = hashlib.sha256()
with open(file, 'rb') as f:
for byte_block in iter(lambda: f.read(BUF_SIZE), b""):
sha256sum.update(byte_block)
calculated_file_hash = sha256sum.hexdigest()
return stored_file_hash, calculated_file_hash
return stored_profile_hash, stored_file_hash, calculated_file_hash
def load_config(path: str) -> any:
@ -184,6 +179,14 @@ def load_config(path: str) -> any:
else:
config['concurrency'] = 0
# Calculate profiles hash
sha256sum = hashlib.sha256()
with io.BytesIO(json.dumps(config['profiles']).encode('utf-8')) as c:
for byte_block in iter(lambda: c.read(BUF_SIZE), b''):
sha256sum.update(byte_block)
profiles_hash = sha256sum.hexdigest()
config['profilesHash'] = profiles_hash
return config
@ -191,14 +194,16 @@ def load_config(path: str) -> any:
@click.option('-d', '--debug/--no-debug', default=False)
@click.option('-c', '--config', default=lambda: os.path.join(os.getcwd(), 'acm-config.json'), show_default=True)
@click.option('-s', '--stdin/--no-stdin', default=False)
@click.option('-p', '--prefix', default=None)
@click.option('--remove-prefix', default=None)
@click.option('--add-prefix', default=None)
@click.pass_context
def cli(ctx, debug, config, stdin, prefix):
def cli(ctx, debug, config, stdin, remove_prefix, add_prefix):
ctx.ensure_object(dict)
ctx.obj['DEBUG'] = debug
ctx.obj['CONFIG'] = load_config(config)
ctx.obj['READ_STDIN'] = stdin
ctx.obj['PREFIX'] = prefix
ctx.obj['REMOVE_PREFIX'] = remove_prefix
ctx.obj['ADD_PREFIX'] = add_prefix
###############################
@ -232,8 +237,8 @@ def list_files(ctx, context, sha256sum, suffix):
for obj in found_objects:
file = obj
if 'PREFIX' in ctx.obj and ctx.obj['PREFIX'] is not None:
file = os.path.join(ctx.obj['PREFIX'], file)
if 'REMOVE_PREFIX' in ctx.obj and ctx.obj['REMOVE_PREFIX'] is not None:
file = os.path.join(ctx.obj['REMOVE_PREFIX'], file)
if suffix is not None and suffix in file:
file = file.replace(suffix, '')
@ -252,9 +257,13 @@ def list_files(ctx, context, sha256sum, suffix):
@cli.command(name="match")
@click.option('-x', '--context', required=True)
@click.option('--print-identity/--no-print-identity', default=False)
@click.argument('files', nargs=-1)
@click.pass_context
def check_matched_files_hashes(ctx, context, files):
def check_matched_files_hashes(ctx, context, print_identity, files):
"""
List all files that have matching stored sha256sum and profilesHash
"""
ctx.obj['CONTEXT'] = context
s3_bucket, s3 = prep_s3(ctx)
matching_files: List[str] = []
@ -265,9 +274,15 @@ def check_matched_files_hashes(ctx, context, files):
for file in files:
file_identity = f'{get_file_identity(ctx.obj, file)}.json'
try:
stored_file_hash, calculated_file_hash = get_file_sha256sum(s3, s3_bucket, file_identity, file)
if calculated_file_hash == stored_file_hash:
matching_files.append(file)
file_object = s3.get_object(s3_bucket, file_identity)
stored_data = json.load(file_object)
stored_profile_hash, stored_file_hash, calculated_file_hash = get_file_sha256sum(stored_data, file)
if calculated_file_hash == stored_file_hash \
and ctx.obj['CONFIG']['profilesHash'] == stored_profile_hash:
if print_identity:
matching_files.append(stored_data['storedAssetIdentity'])
else:
matching_files.append(file)
except NoSuchKey as e:
continue
except ValueError or ResponseError as e:
@ -281,6 +296,9 @@ def check_matched_files_hashes(ctx, context, files):
@click.argument('files', nargs=-1)
@click.pass_context
def check_changed_files_hashes(ctx, context, files):
"""
List all files that do not have a matching sha256sum or profilesHash
"""
ctx.obj['CONTEXT'] = context
s3_bucket, s3 = prep_s3(ctx)
changed_files: List[str] = []
@ -291,8 +309,11 @@ def check_changed_files_hashes(ctx, context, files):
for file in files:
file_identity = f'{get_file_identity(ctx.obj, file)}.json'
try:
stored_file_hash, calculated_file_hash = get_file_sha256sum(s3, s3_bucket, file_identity, file)
if calculated_file_hash != stored_file_hash:
file_object = s3.get_object(s3_bucket, file_identity)
stored_data = json.load(file_object)
stored_profile_hash, stored_file_hash, calculated_file_hash = get_file_sha256sum(stored_data, file)
if calculated_file_hash != stored_file_hash \
or ctx.obj['CONFIG']['profilesHash'] != stored_profile_hash:
changed_files.append(file)
except NoSuchKey as e:
changed_files.append(file)
@ -304,9 +325,13 @@ def check_changed_files_hashes(ctx, context, files):
@cli.command(name="update")
@click.option('-x', '--context', required=True)
@click.option('--input-and-identity/--no-input-and-identity', default=False)
@click.argument('files', nargs=-1)
@click.pass_context
def update_changed_files_hashes(ctx, context, files):
def update_changed_files_hashes(ctx, context, input_and_identity, files):
"""
Store new data objects for the provided files
"""
ctx.obj['CONTEXT'] = context
s3_bucket, s3 = prep_s3(ctx)
updated_files: List[str] = []
@ -315,6 +340,9 @@ def update_changed_files_hashes(ctx, context, files):
files = get_clean_stdin_iterator(click.get_text_stream('stdin'))
for file in files:
identity = None
if input_and_identity:
file, identity = file.split('\t')
file_identity = f'{get_file_identity(ctx.obj, file)}.json'
try:
sha256sum = hashlib.sha256()
@ -322,9 +350,15 @@ def update_changed_files_hashes(ctx, context, files):
for byte_block in iter(lambda: f.read(BUF_SIZE), b''):
sha256sum.update(byte_block)
calculated_file_hash = sha256sum.hexdigest()
object_data = {
"path": file
"sourcePath": file,
"storedAssetIdentity": identity,
"identity": file_identity,
"sha256sum": calculated_file_hash,
"profilesHash": ctx.obj['CONFIG']['profilesHash']
}
with io.BytesIO(json.dumps(object_data, sort_keys=True, indent=None).encode('utf-8')) as data:
data.seek(0, os.SEEK_END)
data_length = data.tell()
@ -335,9 +369,7 @@ def update_changed_files_hashes(ctx, context, files):
data,
data_length,
content_type="application/json",
metadata={
"SHA256SUM": calculated_file_hash
}
metadata={}
)
updated_files.append(file)
except ValueError or ResponseError as e:
@ -351,6 +383,9 @@ def update_changed_files_hashes(ctx, context, files):
@click.argument('files', nargs=-1)
@click.pass_context
def store_files(ctx, context, files):
"""
Store specified files in a <context> bucket for retrieval.
"""
ctx.obj['CONTEXT'] = context
s3_bucket, s3 = prep_s3(ctx)
stored_files: List[str] = []
@ -367,9 +402,12 @@ def store_files(ctx, context, files):
file,
content_type="application/octet-stream"
)
stored_files.append(file)
if 'ADD_PREFIX' in ctx.obj and ctx.obj['ADD_PREFIX'] is not None:
stored_files.append(os.path.join(ctx.obj['ADD_PREFIX'], file_identity))
else:
stored_files.append(file)
except ResponseError as e:
print(f'ERROR: {file} {e}')
print(f'ERROR: {file} {e}', file=sys.stderr)
print(os.linesep.join(stored_files))
@ -380,6 +418,9 @@ def store_files(ctx, context, files):
@click.argument('files', nargs=-1)
@click.pass_context
def retrieve_files(ctx, context, destination, files):
"""
Retrieve specified files from a <context> bucket
"""
ctx.obj['CONTEXT'] = context
s3_bucket, s3 = prep_s3(ctx)
retrieved_files: List[str] = []
@ -399,8 +440,10 @@ def retrieve_files(ctx, context, destination, files):
file_destination
)
retrieved_files.append(file_destination)
except NoSuchKey as e:
print(f'ERROR: {file_identity} {file_destination} {e}', file=sys.stderr)
except ResponseError as e:
print(f'ERROR: {file_destination} {e}')
print(f'ERROR: {file_destination} {e}', file=sys.stderr)
print(os.linesep.join(retrieved_files))
@ -414,9 +457,10 @@ def retrieve_files(ctx, context, destination, files):
@click.option('-p', '--profile', default='default')
@click.option('-c', '--content', default='all')
@click.option('-d', '--destination', default=None)
@click.option('--print-input-and-identity/--no-print-input-and-identity', default=False)
@click.argument('files', nargs=-1)
@click.pass_context
def compress_assets(ctx, profile, content, destination, files):
def compress_assets(ctx, profile, content, destination, print_input_and_identity, files):
profiles = ctx.obj['CONFIG']['profiles']
if profile not in profiles:
@ -471,8 +515,8 @@ def compress_assets(ctx, profile, content, destination, files):
for content_configuration in content_configurations:
if any([input_file.endswith(extension) for extension in content_configuration['extensions']]):
file = input_file
if 'PREFIX' in ctx.obj and ctx.obj['PREFIX'] is not None:
file = strip_prefix(ctx.obj['PREFIX'], input_file)
if 'REMOVE_PREFIX' in ctx.obj and ctx.obj['REMOVE_PREFIX'] is not None:
file = strip_prefix(ctx.obj['REMOVE_PREFIX'], input_file)
if 'preserveInputExtension' in content_configuration \
and content_configuration['preserveInputExtension']:
@ -481,6 +525,8 @@ def compress_assets(ctx, profile, content, destination, files):
output_file_without_ext = os.path.splitext(os.path.join(destination, file))[0]
output_file = f'{output_file_without_ext}.{content_configuration["outputExtension"]}'
output_file_identity = get_file_identity({'REMOVE_PREFIX': destination}, output_file)
output_file_dir = os.path.dirname(output_file)
os.makedirs(output_file_dir, exist_ok=True)
@ -493,7 +539,10 @@ def compress_assets(ctx, profile, content, destination, files):
command,
stdout=asyncio.subprocess.DEVNULL,
stderr=asyncio.subprocess.DEVNULL,
on_success=store_filename(compressed_files, output_file)
on_success=store_filename(
compressed_files,
f'{input_file}\t{output_file_identity}' if print_input_and_identity else output_file
)
)
)

Loading…
Cancel
Save