Tooling for managing asset compression, storage, and retrieval
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

299 lines
8.8 KiB

  1. #!/usr/bin/env python
  2. import hashlib
  3. import io
  4. import json
  5. import os
  6. from typing import List
  7. import click
  8. from minio import Minio, ResponseError
  9. from minio.error import NoSuchKey
# MinIO exposes user-supplied object metadata under this header prefix.
METADATA_PREFIX = 'X-Amz-Meta-'

# Metadata key (capitalized form) under which a file's SHA-256 digest is stored.
METADATA_SHA256SUM = "Sha256sum"

# Chunk size in bytes used when streaming files through the hash functions.
BUF_SIZE = 4096
  16. def get_metadata_name(key):
  17. return METADATA_PREFIX + 'SHA256SUM'.capitalize()
  18. def get_clean_stdin_iterator(stdin_stream):
  19. return (line.strip() for line in stdin_stream if line.strip() != '')
  20. def get_file_identity(ctx_obj, file):
  21. if 'PREFIX' in ctx_obj and ctx_obj['PREFIX'] is not None:
  22. path = file.replace(ctx_obj['PREFIX'], '')
  23. else:
  24. path = file
  25. if os.pathsep != '/':
  26. path = '/'.join(path.split(os.pathsep))
  27. return path
  28. def list_s3_dir(s3: Minio, bucket: str, prefix: str) -> List[str]:
  29. found_files = []
  30. for obj in s3.list_objects_v2(bucket, prefix=prefix):
  31. if obj.is_dir:
  32. found_files.extend(list_s3_dir(s3, bucket, obj.object_name))
  33. else:
  34. found_files.append(obj.object_name)
  35. return found_files
  36. def get_s3_client(config: any) -> Minio:
  37. host = config['host']
  38. secure = config['secure']
  39. access_key = config['access']
  40. secret_key = config['secret']
  41. return Minio(host, secure=secure, access_key=access_key, secret_key=secret_key)
  42. def prep_s3(ctx):
  43. s3_config = ctx.obj['CONFIG']['s3']
  44. s3_bucket = ctx.obj['CONTEXT']
  45. s3 = get_s3_client(s3_config)
  46. if not s3.bucket_exists(s3_bucket):
  47. s3.make_bucket(s3_bucket)
  48. return s3_bucket, s3
  49. def get_file_sha256sum(s3, s3_bucket, file_identity, file):
  50. file_object = s3.stat_object(s3_bucket, file_identity)
  51. stored_file_hash = file_object.metadata[get_metadata_name("SHA256SUM")]
  52. sha256sum = hashlib.sha256()
  53. with open(file, 'rb') as f:
  54. for byte_block in iter(lambda: f.read(BUF_SIZE), b""):
  55. sha256sum.update(byte_block)
  56. calculated_file_hash = sha256sum.hexdigest()
  57. return stored_file_hash, calculated_file_hash
  58. def load_config(path: str) -> any:
  59. with open(path, 'r') as config_file:
  60. config = json.load(config_file)
  61. # Setup S3 Settings
  62. config['s3']['access'] = os.getenv('ACM_S3_ACCESS')
  63. config['s3']['secret'] = os.getenv('ACM_S3_SECRET')
  64. return config
  65. @click.group()
  66. @click.option('-d', '--debug/--no-debug', default=False)
  67. @click.option('-c', '--config', default=lambda: os.path.join(os.getcwd(), 'acm-config.json'), show_default=True)
  68. @click.option('-x', '--context', required=True)
  69. @click.option('-s', '--stdin/--no-stdin', default=False)
  70. @click.option('-p', '--prefix', default=None)
  71. @click.pass_context
  72. def cli(ctx, debug, config, context, stdin, prefix):
  73. ctx.ensure_object(dict)
  74. ctx.obj['DEBUG'] = debug
  75. ctx.obj['CONFIG'] = load_config(config)
  76. ctx.obj['CONTEXT'] = context
  77. ctx.obj['READ_STDIN'] = stdin
  78. ctx.obj['PREFIX'] = prefix
  79. @cli.command(name="list")
  80. @click.option('--sha256sum/--no-sha256sum', default=False)
  81. @click.option('--suffix', default=None)
  82. @click.pass_context
  83. def list_files(ctx, sha256sum, suffix):
  84. s3_config = ctx.obj['CONFIG']['s3']
  85. s3_bucket = ctx.obj['CONTEXT']
  86. s3 = get_s3_client(s3_config)
  87. if not s3.bucket_exists(s3_bucket):
  88. s3.make_bucket(s3_bucket)
  89. found_files: List[str] = []
  90. found_objects: List[str] = []
  91. for obj in s3.list_objects_v2(s3_bucket, recursive=False):
  92. if obj.is_dir:
  93. found_objects.extend(list_s3_dir(s3, s3_bucket, obj.object_name))
  94. else:
  95. found_objects.append(obj.object_name)
  96. for obj in found_objects:
  97. file = obj
  98. if 'PREFIX' in ctx.obj and ctx.obj['PREFIX'] is not None:
  99. file = os.path.join(ctx.obj['PREFIX'], file)
  100. if suffix is not None and suffix in file:
  101. file = file.replace(suffix, '')
  102. file = file.strip()
  103. if sha256sum:
  104. stat = s3.stat_object(s3_bucket, obj)
  105. sha256sum_value = stat.metadata[get_metadata_name("SHA256SUM")]
  106. file = f'{sha256sum_value} {file}'
  107. found_files.append(file)
  108. print(os.linesep.join(found_files))
  109. @cli.command(name="match")
  110. @click.pass_context
  111. @click.argument('files', nargs=-1)
  112. def check_matched_files_hashes(ctx, files):
  113. s3_bucket, s3 = prep_s3(ctx)
  114. matching_files: List[str] = []
  115. if ctx.obj['READ_STDIN']:
  116. files = get_clean_stdin_iterator(click.get_text_stream('stdin'))
  117. for file in files:
  118. file_identity = f'{get_file_identity(ctx.obj, file)}.json'
  119. try:
  120. stored_file_hash, calculated_file_hash = get_file_sha256sum(s3, s3_bucket, file_identity, file)
  121. if calculated_file_hash == stored_file_hash:
  122. matching_files.append(file)
  123. except NoSuchKey as e:
  124. continue
  125. except ValueError or ResponseError as e:
  126. print(f'ERROR: {file} {e}')
  127. print(os.linesep.join(matching_files))
  128. @cli.command(name="check")
  129. @click.pass_context
  130. @click.argument('files', nargs=-1)
  131. def check_changed_files_hashes(ctx, files):
  132. s3_bucket, s3 = prep_s3(ctx)
  133. changed_files: List[str] = []
  134. if ctx.obj['READ_STDIN']:
  135. files = get_clean_stdin_iterator(click.get_text_stream('stdin'))
  136. for file in files:
  137. file_identity = f'{get_file_identity(ctx.obj, file)}.json'
  138. try:
  139. stored_file_hash, calculated_file_hash = get_file_sha256sum(s3, s3_bucket, file_identity, file)
  140. if calculated_file_hash != stored_file_hash:
  141. changed_files.append(file)
  142. except NoSuchKey as e:
  143. changed_files.append(file)
  144. except ValueError or ResponseError as e:
  145. print(f'ERROR: {file} {e}')
  146. print(os.linesep.join(changed_files))
  147. @cli.command(name="update")
  148. @click.pass_context
  149. @click.argument('files', nargs=-1)
  150. def update_changed_files_hashes(ctx, files):
  151. s3_bucket, s3 = prep_s3(ctx)
  152. updated_files: List[str] = []
  153. if ctx.obj['READ_STDIN']:
  154. files = get_clean_stdin_iterator(click.get_text_stream('stdin'))
  155. for file in files:
  156. file_identity = f'{get_file_identity(ctx.obj, file)}.json'
  157. try:
  158. sha256sum = hashlib.sha256()
  159. with open(file, 'rb') as f:
  160. for byte_block in iter(lambda: f.read(BUF_SIZE), b''):
  161. sha256sum.update(byte_block)
  162. calculated_file_hash = sha256sum.hexdigest()
  163. object_data = {
  164. "path": file
  165. }
  166. with io.BytesIO(json.dumps(object_data, sort_keys=True, indent=None).encode('utf-8')) as data:
  167. data.seek(0, os.SEEK_END)
  168. data_length = data.tell()
  169. data.seek(0)
  170. s3.put_object(
  171. s3_bucket,
  172. file_identity,
  173. data,
  174. data_length,
  175. content_type="application/json",
  176. metadata={
  177. "SHA256SUM": calculated_file_hash
  178. }
  179. )
  180. updated_files.append(file)
  181. except ValueError or ResponseError as e:
  182. print(f'ERROR: {file} {e}')
  183. print(os.linesep.join(updated_files))
  184. @cli.command(name="store")
  185. @click.pass_context
  186. @click.argument('files', nargs=-1)
  187. def store_files(ctx, files):
  188. s3_bucket, s3 = prep_s3(ctx)
  189. stored_files: List[str] = []
  190. if ctx.obj['READ_STDIN']:
  191. files = get_clean_stdin_iterator(click.get_text_stream('stdin'))
  192. for file in files:
  193. file_identity = get_file_identity(ctx.obj, file)
  194. try:
  195. s3.fput_object(
  196. s3_bucket,
  197. file_identity,
  198. file,
  199. content_type="application/octet-stream"
  200. )
  201. stored_files.append(file)
  202. except ResponseError as e:
  203. print(f'ERROR: {file} {e}')
  204. print(os.linesep.join(stored_files))
  205. @cli.command(name="retrieve")
  206. @click.pass_context
  207. @click.option('-d', '--destination', default=None)
  208. @click.argument('files', nargs=-1)
  209. def retrieve_files(ctx, destination, files):
  210. s3_bucket, s3 = prep_s3(ctx)
  211. retrieved_files: List[str] = []
  212. if ctx.obj['READ_STDIN']:
  213. files = get_clean_stdin_iterator(click.get_text_stream('stdin'))
  214. for file in files:
  215. file_identity = get_file_identity(ctx.obj, file)
  216. file_destination = file
  217. if destination is not None:
  218. file_destination = os.path.join(destination, file_identity)
  219. try:
  220. s3.fget_object(
  221. s3_bucket,
  222. file_identity,
  223. file_destination
  224. )
  225. retrieved_files.append(file_destination)
  226. except ResponseError as e:
  227. print(f'ERROR: {file_destination} {e}')
  228. print(os.linesep.join(retrieved_files))
# Script entry point: invoke the Click command group with an empty context object.
if __name__ == '__main__':
    cli(obj={})