comparison env/lib/python3.9/site-packages/bagit.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import, division, print_function, unicode_literals
5
6 import argparse
7 import codecs
8 import gettext
9 import hashlib
10 import logging
11 import multiprocessing
12 import os
13 import re
14 import signal
15 import sys
16 import tempfile
17 import unicodedata
18 import warnings
19 from collections import defaultdict
20 from datetime import date
21 from functools import partial
22 from os.path import abspath, isdir, isfile, join
23
24 from pkg_resources import DistributionNotFound, get_distribution
25
26 try:
27 from urllib.parse import urlparse
28 except ImportError:
29 from urlparse import urlparse
30
31
def find_locale_dir():
    """Locate the translation catalog directory.

    Checks for a ``locale`` directory first next to this module and then
    under ``sys.prefix``; returns the first one that exists, or ``None``
    when no catalog is installed (callers rely on gettext's fallback).
    """
    candidates = (os.path.dirname(__file__), sys.prefix)
    for base in candidates:
        candidate = os.path.join(base, "locale")
        if os.path.isdir(candidate):
            return candidate
    return None
38
# Load the bagit-python message catalog; with fallback=True gettext returns a
# NullTranslations object when no catalog is found, so `_` always works.
TRANSLATION_CATALOG = gettext.translation(
    "bagit-python", localedir=find_locale_dir(), fallback=True
)
if sys.version_info < (3,):
    # Python 2: ugettext returns unicode rather than bytes.
    _ = TRANSLATION_CATALOG.ugettext
else:
    _ = TRANSLATION_CATALOG.gettext

# When executed as a script __name__ is "__main__"; use the real module name
# so the logger and the distribution lookup below still refer to "bagit".
MODULE_NAME = "bagit" if __name__ == "__main__" else __name__

LOGGER = logging.getLogger(MODULE_NAME)

try:
    VERSION = get_distribution(MODULE_NAME).version
except DistributionNotFound:
    # Running from a source checkout rather than an installed distribution.
    VERSION = "0.0.dev0"

PROJECT_URL = "https://github.com/LibraryOfCongress/bagit-python"
57
# The module docstring doubles as the --help epilog for the command-line tool;
# %(PROJECT_URL)s is interpolated from the module globals defined above.
__doc__ = (
    _(
        """
BagIt is a directory, filename convention for bundling an arbitrary set of
files with a manifest, checksums, and additional metadata. More about BagIt
can be found at:

    http://purl.org/net/bagit

bagit.py is a pure python drop in library and command line tool for creating,
and working with BagIt directories.


Command-Line Usage:

Basic usage is to give bagit.py a directory to bag up:

    $ bagit.py my_directory

This does a bag-in-place operation where the current contents will be moved
into the appropriate BagIt structure and the metadata files will be created.

You can bag multiple directories if you wish:

    $ bagit.py directory1 directory2

Optionally you can provide metadata which will be stored in bag-info.txt:

    $ bagit.py --source-organization "Library of Congress" directory

You can also select which manifest algorithms will be used:

    $ bagit.py --sha1 --md5 --sha256 --sha512 directory


Using BagIt from your Python code:

    import bagit
    bag = bagit.make_bag('example-directory', {'Contact-Name': 'Ed Summers'})
    print(bag.entries)

For more information or to contribute to bagit-python's development, please
visit %(PROJECT_URL)s
"""
    )
    % globals()
)
105
# standard bag-info.txt metadata
STANDARD_BAG_INFO_HEADERS = [
    "Source-Organization",
    "Organization-Address",
    "Contact-Name",
    "Contact-Phone",
    "Contact-Email",
    "External-Description",
    "External-Identifier",
    "Bag-Size",
    "Bag-Group-Identifier",
    "Bag-Count",
    "Internal-Sender-Identifier",
    "Internal-Sender-Description",
    "BagIt-Profile-Identifier",
    # Bagging-Date is autogenerated
    # Payload-Oxum is autogenerated
]

try:
    CHECKSUM_ALGOS = hashlib.algorithms_guaranteed
except AttributeError:
    # FIXME: remove when we drop Python 2 (https://github.com/LibraryOfCongress/bagit-python/issues/102)
    # Python 2.7.0-2.7.8
    CHECKSUM_ALGOS = set(hashlib.algorithms)

#: Manifest algorithms used when the caller does not request specific ones:
DEFAULT_CHECKSUMS = ["sha256", "sha512"]

#: Block size used when reading files for hashing:
HASH_BLOCK_SIZE = 512 * 1024

#: Convenience function used everywhere we want to open a file to read text
#: rather than undecoded bytes:
open_text_file = partial(codecs.open, encoding="utf-8", errors="strict")

# This is the same as decoding the byte values in codecs.BOM:
UNICODE_BYTE_ORDER_MARK = "\uFEFF"
142
143
def make_bag(
    bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8"
):
    """
    Convert a given directory into a bag. You can pass in arbitrary
    key/value pairs to put into the bag-info.txt metadata file as
    the bag_info dictionary.

    This is a bag-in-place operation: the existing contents of ``bag_dir``
    are moved into a new ``data/`` payload directory and the tag files are
    written alongside it.

    Arguments:
        bag_dir: directory to convert in place
        bag_info: optional dict of tags written to bag-info.txt
        processes: worker processes used when hashing the payload
        checksums: manifest algorithm names (defaults to DEFAULT_CHECKSUMS)
        checksum: deprecated alias for ``checksums``
        encoding: encoding used when writing the payload manifests

    Returns a Bag instance for the newly created bag; raises RuntimeError or
    BagError on failure.
    """

    if checksum is not None:
        warnings.warn(
            _(
                "The `checksum` argument for `make_bag` should be replaced with `checksums`"
            ),
            DeprecationWarning,
        )
        checksums = checksum

    if checksums is None:
        checksums = DEFAULT_CHECKSUMS

    bag_dir = os.path.abspath(bag_dir)
    cwd = os.path.abspath(os.path.curdir)

    # Refuse to bag an ancestor of the current directory: the os.chdir below
    # would otherwise move our own working directory into the payload.
    if cwd.startswith(bag_dir) and cwd != bag_dir:
        raise RuntimeError(
            _("Bagging a parent of the current directory is not supported")
        )

    LOGGER.info(_("Creating bag for directory %s"), bag_dir)

    if not os.path.isdir(bag_dir):
        LOGGER.error(_("Bag directory %s does not exist"), bag_dir)
        raise RuntimeError(_("Bag directory %s does not exist") % bag_dir)

    # FIXME: we should do the permissions checks before changing directories
    old_dir = os.path.abspath(os.path.curdir)

    try:
        # TODO: These two checks are currently redundant since an unreadable directory will also
        # often be unwritable, and this code will require review when we add the option to
        # bag to a destination other than the source. It would be nice if we could avoid
        # walking the directory tree more than once even if most filesystems will cache it

        unbaggable = _can_bag(bag_dir)

        if unbaggable:
            LOGGER.error(
                _("Unable to write to the following directories and files:\n%s"),
                unbaggable,
            )
            raise BagError(_("Missing permissions to move all files and directories"))

        unreadable_dirs, unreadable_files = _can_read(bag_dir)

        if unreadable_dirs or unreadable_files:
            if unreadable_dirs:
                LOGGER.error(
                    _("The following directories do not have read permissions:\n%s"),
                    unreadable_dirs,
                )
            if unreadable_files:
                LOGGER.error(
                    _("The following files do not have read permissions:\n%s"),
                    unreadable_files,
                )
            raise BagError(
                _("Read permissions are required to calculate file fixities")
            )
        else:
            LOGGER.info(_("Creating data directory"))

            # FIXME: if we calculate full paths we won't need to deal with changing directories
            os.chdir(bag_dir)
            cwd = os.getcwd()
            # Stage the payload in a temp dir inside the bag so the final
            # rename to "data" is atomic on the same filesystem.
            temp_data = tempfile.mkdtemp(dir=cwd)

            for f in os.listdir("."):
                if os.path.abspath(f) == temp_data:
                    continue
                new_f = os.path.join(temp_data, f)
                LOGGER.info(
                    _("Moving %(source)s to %(destination)s"),
                    {"source": f, "destination": new_f},
                )
                os.rename(f, new_f)

            LOGGER.info(
                _("Moving %(source)s to %(destination)s"),
                {"source": temp_data, "destination": "data"},
            )
            os.rename(temp_data, "data")

            # permissions for the payload directory should match those of the
            # original directory
            os.chmod("data", os.stat(cwd).st_mode)

            total_bytes, total_files = make_manifests(
                "data", processes, algorithms=checksums, encoding=encoding
            )

            LOGGER.info(_("Creating bagit.txt"))
            txt = """BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8\n"""
            with open_text_file("bagit.txt", "w") as bagit_file:
                bagit_file.write(txt)

            LOGGER.info(_("Creating bag-info.txt"))
            if bag_info is None:
                bag_info = {}

            # allow 'Bagging-Date' and 'Bag-Software-Agent' to be overidden
            if "Bagging-Date" not in bag_info:
                bag_info["Bagging-Date"] = date.strftime(date.today(), "%Y-%m-%d")
            if "Bag-Software-Agent" not in bag_info:
                bag_info["Bag-Software-Agent"] = "bagit.py v%s <%s>" % (
                    VERSION,
                    PROJECT_URL,
                )

            bag_info["Payload-Oxum"] = "%s.%s" % (total_bytes, total_files)
            _make_tag_file("bag-info.txt", bag_info)

            for c in checksums:
                # NOTE(review): tag manifests are always written as UTF-8 here
                # even when a different ``encoding`` was requested for the
                # payload manifests — confirm whether this is intentional
                # (bagit.txt above always declares UTF-8).
                _make_tagmanifest_file(c, bag_dir, encoding="utf-8")
    except Exception:
        LOGGER.exception(_("An error occurred creating a bag in %s"), bag_dir)
        raise
    finally:
        # Always restore the caller's working directory, even on failure.
        os.chdir(old_dir)

    return Bag(bag_dir)
275
276
class Bag(object):
    """A representation of a bag."""

    valid_files = ["bagit.txt", "fetch.txt"]
    valid_directories = ["data"]

    def __init__(self, path=None):
        super(Bag, self).__init__()
        self.tags = {}
        self.info = {}
        #: Dictionary of manifest entries and the checksum values for each
        #: algorithm:
        self.entries = {}

        # To reliably handle Unicode normalization differences, we maintain
        # lookup dictionaries in both directions for the filenames read from
        # the filesystem and the manifests so we can handle cases where the
        # normalization form changed between the bag being created and read.
        # See https://github.com/LibraryOfCongress/bagit-python/issues/51.

        #: maps Unicode-normalized values to the raw value from the filesystem
        self.normalized_filesystem_names = {}

        #: maps Unicode-normalized values to the raw value in the manifest
        self.normalized_manifest_names = {}

        self.algorithms = []
        self.tag_file_name = None
        # NOTE(review): abspath(None) raises if path is None, so the default
        # argument appears unusable in practice — confirm callers always
        # provide a path.
        self.path = abspath(path)
        if path:
            # if path ends in a path separator, stripit off
            # (note this reassigns the raw, non-absolute path)
            if path[-1] == os.sep:
                self.path = path[:-1]
            self._open()

    def __str__(self):
        # FIXME: develop a more informative string representation for a Bag
        return self.path

    @property
    def algs(self):
        # Deprecated alias kept for backward compatibility.
        warnings.warn(_("Use Bag.algorithms instead of Bag.algs"), DeprecationWarning)
        return self.algorithms

    @property
    def version(self):
        # Deprecated: returns the raw version string from bagit.txt.
        warnings.warn(
            _("Use the Bag.version_info tuple instead of Bag.version"),
            DeprecationWarning,
        )
        return self._version

    def _open(self):
        """Read bagit.txt, the tag file, and the manifests into memory.

        Raises BagError/BagValidationError when required tags are missing,
        the version is unsupported, or the declared encoding is unknown.
        """
        # Open the bagit.txt file, and load any tags from it, including
        # the required version and encoding.
        bagit_file_path = os.path.join(self.path, "bagit.txt")

        if not isfile(bagit_file_path):
            raise BagError(_("Expected bagit.txt does not exist: %s") % bagit_file_path)

        self.tags = tags = _load_tag_file(bagit_file_path)

        required_tags = ("BagIt-Version", "Tag-File-Character-Encoding")
        missing_tags = [i for i in required_tags if i not in tags]
        if missing_tags:
            raise BagError(
                _("Missing required tag in bagit.txt: %s") % ", ".join(missing_tags)
            )

        # To avoid breaking existing code we'll leave self.version as the string
        # and parse it into a numeric version_info tuple. In version 2.0 we can
        # break that.

        self._version = tags["BagIt-Version"]

        try:
            self.version_info = tuple(int(i) for i in self._version.split(".", 1))
        except ValueError:
            raise BagError(
                _("Bag version numbers must be MAJOR.MINOR numbers, not %s")
                % self._version
            )

        # Versions 0.93-0.95 used package-info.txt; 0.96+ use bag-info.txt.
        if (0, 93) <= self.version_info <= (0, 95):
            self.tag_file_name = "package-info.txt"
        elif (0, 96) <= self.version_info < (2,):
            self.tag_file_name = "bag-info.txt"
        else:
            raise BagError(_("Unsupported bag version: %s") % self._version)

        self.encoding = tags["Tag-File-Character-Encoding"]

        try:
            codecs.lookup(self.encoding)
        except LookupError:
            raise BagValidationError(_("Unsupported encoding: %s") % self.encoding)

        info_file_path = os.path.join(self.path, self.tag_file_name)
        if os.path.exists(info_file_path):
            self.info = _load_tag_file(info_file_path, encoding=self.encoding)

        self._load_manifests()

    def manifest_files(self):
        """Yield full paths of the payload manifest files that exist."""
        for filename in ["manifest-%s.txt" % a for a in CHECKSUM_ALGOS]:
            f = os.path.join(self.path, filename)
            if isfile(f):
                yield f

    def tagmanifest_files(self):
        """Yield full paths of the tag manifest files that exist."""
        for filename in ["tagmanifest-%s.txt" % a for a in CHECKSUM_ALGOS]:
            f = os.path.join(self.path, filename)
            if isfile(f):
                yield f

    def compare_manifests_with_fs(self):
        """
        Compare the filenames in the manifests to the filenames present on the
        local filesystem and returns two lists of the files which are only
        present in the manifests and the files which are only present on the
        local filesystem, respectively.
        """

        # We compare the filenames after Unicode normalization so we can
        # reliably detect normalization changes after bag creation:
        files_on_fs = set(normalize_unicode(i) for i in self.payload_files())
        files_in_manifest = set(
            normalize_unicode(i) for i in self.payload_entries().keys()
        )

        if self.version_info >= (0, 97):
            files_in_manifest.update(self.missing_optional_tagfiles())

        only_on_fs = list()
        only_in_manifest = list()

        for i in files_on_fs.difference(files_in_manifest):
            only_on_fs.append(self.normalized_filesystem_names[i])

        for i in files_in_manifest.difference(files_on_fs):
            only_in_manifest.append(self.normalized_manifest_names[i])

        return only_in_manifest, only_on_fs

    def compare_fetch_with_fs(self):
        """Compares the fetch entries with the files actually
        in the payload, and returns a list of all the files
        that still need to be fetched.
        """

        files_on_fs = set(self.payload_files())
        files_in_fetch = set(self.files_to_be_fetched())

        return list(files_in_fetch - files_on_fs)

    def payload_files(self):
        """Returns a list of filenames which are present on the local filesystem"""
        payload_dir = os.path.join(self.path, "data")

        # NOTE(review): the throwaway `_` loop variable shadows the module's
        # gettext `_` inside this generator; harmless here since `_` is not
        # called in this method, but worth renaming eventually.
        for dirpath, _, filenames in os.walk(payload_dir):
            for f in filenames:
                # Jump through some hoops here to make the payload files are
                # returned with the directory structure relative to the base
                # directory rather than the walk root
                normalized_f = os.path.normpath(f)
                rel_path = os.path.relpath(
                    os.path.join(dirpath, normalized_f), start=self.path
                )

                # Record the normalized -> raw mapping as a side effect so
                # later comparisons can recover the on-disk spelling.
                self.normalized_filesystem_names[normalize_unicode(rel_path)] = rel_path
                yield rel_path

    def payload_entries(self):
        """Return the subset of manifest entries that describe payload
        ("data/") files, as a dict mapping path -> {algorithm: checksum}."""
        # Don't use dict comprehension (compatibility with Python < 2.7)
        return dict(
            (key, value)
            for (key, value) in self.entries.items()
            if key.startswith("data" + os.sep)
        )

    def save(self, processes=1, manifests=False):
        """
        save will persist any changes that have been made to the bag
        metadata (self.info).

        If you have modified the payload of the bag (added, modified,
        removed files in the data directory) and want to regenerate manifests
        set the manifests parameter to True. The default is False since you
        wouldn't want a save to accidentally create a new manifest for
        a corrupted bag.

        If you want to control the number of processes that are used when
        recalculating checksums use the processes parameter.
        """
        # Error checking
        if not self.path:
            raise BagError(_("Bag.save() called before setting the path!"))

        if not os.access(self.path, os.R_OK | os.W_OK | os.X_OK):
            raise BagError(
                _("Cannot save bag to non-existent or inaccessible directory %s")
                % self.path
            )

        unbaggable = _can_bag(self.path)
        if unbaggable:
            LOGGER.error(
                _(
                    "Missing write permissions for the following directories and files:\n%s"
                ),
                unbaggable,
            )
            raise BagError(_("Missing permissions to move all files and directories"))

        unreadable_dirs, unreadable_files = _can_read(self.path)
        if unreadable_dirs or unreadable_files:
            if unreadable_dirs:
                LOGGER.error(
                    _("The following directories do not have read permissions:\n%s"),
                    unreadable_dirs,
                )
            if unreadable_files:
                LOGGER.error(
                    _("The following files do not have read permissions:\n%s"),
                    unreadable_files,
                )
            raise BagError(
                _("Read permissions are required to calculate file fixities")
            )

        # Change working directory to bag directory so helper functions work
        # NOTE(review): unlike make_bag, this chdir is not wrapped in
        # try/finally, so an exception below leaves the process in self.path.
        old_dir = os.path.abspath(os.path.curdir)
        os.chdir(self.path)

        # Generate new manifest files
        if manifests:
            total_bytes, total_files = make_manifests(
                "data", processes, algorithms=self.algorithms, encoding=self.encoding
            )

            # Update Payload-Oxum
            LOGGER.info(_("Updating Payload-Oxum in %s"), self.tag_file_name)
            self.info["Payload-Oxum"] = "%s.%s" % (total_bytes, total_files)

        _make_tag_file(self.tag_file_name, self.info)

        # Update tag-manifest for changes to manifest & bag-info files
        for alg in self.algorithms:
            _make_tagmanifest_file(alg, self.path, encoding=self.encoding)

        # Reload the manifests
        self._load_manifests()

        os.chdir(old_dir)

    def tagfile_entries(self):
        """Return manifest entries for tag files (everything outside data/)."""
        return dict(
            (key, value)
            for (key, value) in self.entries.items()
            if not key.startswith("data" + os.sep)
        )

    def missing_optional_tagfiles(self):
        """
        From v0.97 we need to validate any tagfiles listed
        in the optional tagmanifest(s). As there is no mandatory
        directory structure for additional tagfiles we can
        only check for entries with missing files (not missing
        entries for existing files).
        """
        for tagfilepath in self.tagfile_entries().keys():
            if not os.path.isfile(os.path.join(self.path, tagfilepath)):
                yield tagfilepath

    def fetch_entries(self):
        """Load fetch.txt if present and iterate over its contents

        yields (url, size, filename) tuples

        raises BagError for errors such as an unsafe filename referencing
        data outside of the bag directory
        """

        fetch_file_path = os.path.join(self.path, "fetch.txt")

        if isfile(fetch_file_path):
            with open_text_file(
                fetch_file_path, "r", encoding=self.encoding
            ) as fetch_file:
                for line in fetch_file:
                    # Each line is "URL SIZE FILENAME"; the filename may
                    # contain whitespace, hence maxsplit=2.
                    url, file_size, filename = line.strip().split(None, 2)

                    if self._path_is_dangerous(filename):
                        raise BagError(
                            _('Path "%(payload_file)s" in "%(source_file)s" is unsafe')
                            % {
                                "payload_file": filename,
                                "source_file": os.path.join(self.path, "fetch.txt"),
                            }
                        )

                    yield url, file_size, filename

    def files_to_be_fetched(self):
        """
        Convenience wrapper for fetch_entries which returns only the
        local filename
        """

        for url, file_size, filename in self.fetch_entries():
            yield filename

    def has_oxum(self):
        """Return True when bag-info declares a Payload-Oxum tag."""
        return "Payload-Oxum" in self.info

    def validate(self, processes=1, fast=False, completeness_only=False):
        """Checks the structure and contents are valid.

        If you supply the parameter fast=True the Payload-Oxum (if present) will
        be used to check that the payload files are present and accounted for,
        instead of re-calculating fixities and comparing them against the
        manifest. By default validate() will re-calculate fixities (fast=False).
        """

        self._validate_structure()
        self._validate_bagittxt()

        self.validate_fetch()

        self._validate_contents(
            processes=processes, fast=fast, completeness_only=completeness_only
        )

        return True

    def is_valid(self, fast=False, completeness_only=False):
        """Returns validation success or failure as boolean.
        Optional fast parameter passed directly to validate().
        """

        try:
            self.validate(fast=fast, completeness_only=completeness_only)
        except BagError:
            return False

        return True

    def _load_manifests(self):
        """Parse all payload (and, for v0.97+, tag) manifests into
        self.entries and record the algorithms that were found."""
        self.entries = {}
        manifests = list(self.manifest_files())

        if self.version_info >= (0, 97):
            # v0.97+ requires that optional tagfiles are verified.
            manifests += list(self.tagmanifest_files())

        for manifest_filename in manifests:
            if manifest_filename.find("tagmanifest-") != -1:
                search = "tagmanifest-"
            else:
                search = "manifest-"
            # Derive the algorithm name from the manifest filename.
            alg = (
                os.path.basename(manifest_filename)
                .replace(search, "")
                .replace(".txt", "")
            )
            if alg not in self.algorithms:
                self.algorithms.append(alg)

            with open_text_file(
                manifest_filename, "r", encoding=self.encoding
            ) as manifest_file:
                if manifest_file.encoding.startswith("UTF"):
                    # We'll check the first character to see if it's a BOM:
                    if manifest_file.read(1) == UNICODE_BYTE_ORDER_MARK:
                        # We'll skip it either way by letting line decoding
                        # happen at the new offset but we will issue a warning
                        # for UTF-8 since the presence of a BOM is contrary to
                        # the BagIt specification:
                        if manifest_file.encoding == "UTF-8":
                            LOGGER.warning(
                                _(
                                    "%s is encoded using UTF-8 but contains an unnecessary"
                                    " byte-order mark, which is not in compliance with the"
                                    " BagIt RFC"
                                ),
                                manifest_file.name,
                            )
                    else:
                        manifest_file.seek(0)  # Pretend the first read never happened

                for line in manifest_file:
                    line = line.strip()

                    # Ignore blank lines and comments.
                    if line == "" or line.startswith("#"):
                        continue

                    entry = line.split(None, 1)

                    # Format is FILENAME *CHECKSUM
                    if len(entry) != 2:
                        LOGGER.error(
                            _(
                                "%(bag)s: Invalid %(algorithm)s manifest entry: %(line)s"
                            ),
                            {"bag": self, "algorithm": alg, "line": line},
                        )
                        continue

                    entry_hash = entry[0]
                    # A leading "*" marks binary mode in some tools; strip it.
                    entry_path = os.path.normpath(entry[1].lstrip("*"))
                    entry_path = _decode_filename(entry_path)

                    if self._path_is_dangerous(entry_path):
                        raise BagError(
                            _(
                                'Path "%(payload_file)s" in manifest "%(manifest_file)s" is unsafe'
                            )
                            % {
                                "payload_file": entry_path,
                                "manifest_file": manifest_file.name,
                            }
                        )

                    entry_hashes = self.entries.setdefault(entry_path, {})

                    if alg in entry_hashes:
                        warning_ctx = {
                            "bag": self,
                            "algorithm": alg,
                            "filename": entry_path,
                        }
                        if entry_hashes[alg] == entry_hash:
                            msg = _(
                                "%(bag)s: %(algorithm)s manifest lists %(filename)s"
                                " multiple times with the same value"
                            )
                            # Duplicate identical entries are fatal in v1.0+,
                            # only a warning for older bags.
                            if self.version_info >= (1,):
                                raise BagError(msg % warning_ctx)
                            else:
                                LOGGER.warning(msg, warning_ctx)
                        else:
                            raise BagError(
                                _(
                                    "%(bag)s: %(algorithm)s manifest lists %(filename)s"
                                    " multiple times with conflicting values"
                                )
                                % warning_ctx
                            )

                    entry_hashes[alg] = entry_hash

        self.normalized_manifest_names.update(
            (normalize_unicode(i), i) for i in self.entries.keys()
        )

    def _validate_structure(self):
        """
        Checks the structure of the bag to determine whether it conforms to the
        BagIt spec. Returns true on success, otherwise it will raise a
        BagValidationError exception.
        """

        self._validate_structure_payload_directory()
        self._validate_structure_tag_files()

    def _validate_structure_payload_directory(self):
        # The data/ payload directory is mandatory in every BagIt version.
        data_dir_path = os.path.join(self.path, "data")

        if not isdir(data_dir_path):
            raise BagValidationError(
                _("Expected data directory %s does not exist") % data_dir_path
            )

    def _validate_structure_tag_files(self):
        # Note: we deviate somewhat from v0.96 of the spec in that it allows
        # other files and directories to be present in the base directory

        if not list(self.manifest_files()):
            raise BagValidationError(_("No manifest files found"))
        if "bagit.txt" not in os.listdir(self.path):
            raise BagValidationError(
                _('Expected %s to contain "bagit.txt"') % self.path
            )

    def validate_fetch(self):
        """Validate the fetch.txt file

        Raises `BagError` for errors and otherwise returns no value
        """

        for url, file_size, filename in self.fetch_entries():
            # fetch_entries will raise a BagError for unsafe filenames
            # so at this point we will check only that the URL is minimally
            # well formed:
            parsed_url = urlparse(url)

            if not all((parsed_url.scheme, parsed_url.netloc)):
                raise BagError(_("Malformed URL in fetch.txt: %s") % url)

    def _validate_contents(self, processes=1, fast=False, completeness_only=False):
        """Run the payload checks in increasing order of cost: oxum, then
        completeness, then (unless skipped) full checksum verification."""
        if fast and not self.has_oxum():
            raise BagValidationError(
                _("Fast validation requires bag-info.txt to include Payload-Oxum")
            )

        # Perform the fast file count + size check so we can fail early:
        self._validate_oxum()

        if fast:
            return

        self._validate_completeness()

        if completeness_only:
            return

        self._validate_entries(processes)

    def _validate_oxum(self):
        """Check the Payload-Oxum (byte count.file count) against the
        payload on disk; a no-op when the tag is absent."""
        oxum = self.info.get("Payload-Oxum")

        if oxum is None:
            return

        # If multiple Payload-Oxum tags (bad idea)
        # use the first listed in bag-info.txt
        if isinstance(oxum, list):
            LOGGER.warning(_("bag-info.txt defines multiple Payload-Oxum values!"))
            oxum = oxum[0]

        oxum_byte_count, oxum_file_count = oxum.split(".", 1)

        if not oxum_byte_count.isdigit() or not oxum_file_count.isdigit():
            raise BagError(_("Malformed Payload-Oxum value: %s") % oxum)

        oxum_byte_count = int(oxum_byte_count)
        oxum_file_count = int(oxum_file_count)
        total_bytes = 0
        total_files = 0

        for payload_file in self.payload_files():
            payload_file = os.path.join(self.path, payload_file)
            total_bytes += os.stat(payload_file).st_size
            total_files += 1

        if oxum_file_count != total_files or oxum_byte_count != total_bytes:
            raise BagValidationError(
                _(
                    "Payload-Oxum validation failed."
                    " Expected %(oxum_file_count)d files and %(oxum_byte_count)d bytes"
                    " but found %(found_file_count)d files and %(found_byte_count)d bytes"
                )
                % {
                    "found_file_count": total_files,
                    "found_byte_count": total_bytes,
                    "oxum_file_count": oxum_file_count,
                    "oxum_byte_count": oxum_byte_count,
                }
            )

    def _validate_completeness(self):
        """
        Verify that the actual file manifests match the files in the data directory
        """
        errors = list()

        # First we'll make sure there's no mismatch between the filesystem
        # and the list of files in the manifest(s)
        only_in_manifests, only_on_fs = self.compare_manifests_with_fs()
        for path in only_in_manifests:
            e = FileMissing(path)
            LOGGER.warning(force_unicode(e))
            errors.append(e)
        for path in only_on_fs:
            e = UnexpectedFile(path)
            LOGGER.warning(force_unicode(e))
            errors.append(e)

        if errors:
            raise BagValidationError(_("Bag validation failed"), errors)

    def _validate_entries(self, processes):
        """
        Verify that the actual file contents match the recorded hashes stored in the manifest files
        """
        errors = list()

        if os.name == "posix":
            worker_init = posix_multiprocessing_worker_initializer
        else:
            worker_init = None

        # Lazily build the per-file hashing work items.
        args = (
            (
                self.path,
                self.normalized_filesystem_names.get(rel_path, rel_path),
                hashes,
                self.algorithms,
            )
            for rel_path, hashes in self.entries.items()
        )

        try:
            if processes == 1:
                hash_results = [_calc_hashes(i) for i in args]
            else:
                # NOTE(review): if Pool() itself raises, `pool` is unbound
                # when the finally clause runs — confirm/guard upstream.
                try:
                    pool = multiprocessing.Pool(
                        processes if processes else None, initializer=worker_init
                    )
                    hash_results = pool.map(_calc_hashes, args)
                finally:
                    pool.terminate()

        # Any unhandled exceptions are probably fatal
        # (intentionally bare: log the failure, then re-raise unchanged)
        except:
            LOGGER.exception(_("Unable to calculate file hashes for %s"), self)
            raise

        for rel_path, f_hashes, hashes in hash_results:
            for alg, computed_hash in f_hashes.items():
                stored_hash = hashes[alg]
                # Manifest hashes may be upper-case; computed ones are lower.
                if stored_hash.lower() != computed_hash:
                    e = ChecksumMismatch(
                        rel_path, alg, stored_hash.lower(), computed_hash
                    )
                    LOGGER.warning(force_unicode(e))
                    errors.append(e)

        if errors:
            raise BagValidationError(_("Bag validation failed"), errors)

    def _validate_bagittxt(self):
        """
        Verify that bagit.txt conforms to specification
        """
        bagit_file_path = os.path.join(self.path, "bagit.txt")

        # Note that we are intentionally opening this file in binary mode so we can confirm
        # that it does not start with the UTF-8 byte-order-mark
        with open(bagit_file_path, "rb") as bagit_file:
            first_line = bagit_file.read(4)
            if first_line.startswith(codecs.BOM_UTF8):
                raise BagValidationError(
                    _("bagit.txt must not contain a byte-order mark")
                )

    def _path_is_dangerous(self, path):
        """
        Return true if path looks dangerous, i.e. potentially operates
        outside the bagging directory structure, e.g. ~/.bashrc, ../../../secrets.json,
        \\?\c:\, D:\sys32\cmd.exe
        """
        if os.path.isabs(path):
            return True
        if os.path.expanduser(path) != path:
            return True
        if os.path.expandvars(path) != path:
            return True
        # Resolve symlinks/".." and confirm the result stays inside the bag.
        real_path = os.path.realpath(os.path.join(self.path, path))
        real_path = os.path.normpath(real_path)
        bag_path = os.path.realpath(self.path)
        bag_path = os.path.normpath(bag_path)
        common = os.path.commonprefix((bag_path, real_path))
        return not (common == bag_path)
944
945
class BagError(Exception):
    """Base class for all bag-related errors raised by this module."""
    pass
948
949
class BagValidationError(BagError):
    """Raised when a bag fails validation.

    Carries an optional list of per-file detail objects (ChecksumMismatch,
    FileMissing, UnexpectedFile) in ``details``.
    """

    def __init__(self, message, details=None):
        super(BagValidationError, self).__init__()
        # Normalize None to a fresh list so we never share a mutable default.
        self.message = message
        self.details = [] if details is None else details

    def __str__(self):
        if not self.details:
            return self.message
        joined = "; ".join(force_unicode(err) for err in self.details)
        return "%s: %s" % (self.message, joined)
965
966
class ManifestErrorDetail(BagError):
    """Base class for per-file validation details; records the offending path."""

    def __init__(self, path):
        super(ManifestErrorDetail, self).__init__()
        self.path = path
972
973
class ChecksumMismatch(ManifestErrorDetail):
    """Detail record for a file whose computed hash differs from the manifest."""

    def __init__(self, path, algorithm=None, expected=None, found=None):
        super(ChecksumMismatch, self).__init__(path)
        self.path = path
        self.algorithm = algorithm
        self.expected = expected
        self.found = found

    def __str__(self):
        context = {
            "path": force_unicode(self.path),
            "algorithm": self.algorithm,
            "expected": self.expected,
            "found": self.found,
        }
        template = _(
            '%(path)s %(algorithm)s validation failed: expected="%(expected)s" found="%(found)s"'
        )
        return template % context
992
993
class FileMissing(ManifestErrorDetail):
    """Detail record for a manifest entry with no matching file on disk."""

    def __str__(self):
        template = _("%s exists in manifest but was not found on filesystem")
        return template % force_unicode(self.path)
999
1000
class UnexpectedFile(ManifestErrorDetail):
    """Detail record for an on-disk file that appears in no manifest."""

    def __str__(self):
        template = _("%s exists on filesystem but is not in the manifest")
        return template % self.path
1004
1005
class FileNormalizationConflict(BagError):
    """
    Exception raised when two files differ only in normalization and thus
    are not safely portable
    """

    def __init__(self, file_a, file_b):
        super(FileNormalizationConflict, self).__init__()
        # Keep both spellings so the message can show the conflicting pair.
        self.file_a, self.file_b = file_a, file_b

    def __str__(self):
        context = {"file_a": self.file_a, "file_b": self.file_b}
        template = _(
            'Unicode normalization conflict for file "%(file_a)s" and "%(file_b)s"'
        )
        return template % context
1022
1023
def posix_multiprocessing_worker_initializer():
    """Make multiprocessing workers ignore SIGINT on POSIX systems.

    The parent process handles the interrupt and terminates the pool;
    letting workers receive SIGINT directly would produce noisy tracebacks.
    """
    ignore_handler = signal.SIG_IGN
    signal.signal(signal.SIGINT, ignore_handler)
1027
1028
1029 # The Unicode normalization form used here doesn't matter – all we care about
1030 # is consistency since the input value will be preserved:
1031
1032
def normalize_unicode_py3(s):
    """Return *s* normalized to Unicode NFC form (Python 3 text strings)."""
    normalized = unicodedata.normalize("NFC", s)
    return normalized
1035
1036
def normalize_unicode_py2(s):
    """Return *s* normalized to Unicode NFC form (Python 2 shim).

    Byte strings are decoded as UTF-8 first so normalization always
    operates on a ``unicode`` object.
    """
    if isinstance(s, str):
        # Python 2 only: str here is a byte string needing decoding.
        s = s.decode("utf-8")
    return unicodedata.normalize("NFC", s)
1041
1042
# Pick the right implementation once at import time rather than checking the
# interpreter version on every call.
if sys.version_info > (3, 0):
    normalize_unicode = normalize_unicode_py3
else:
    normalize_unicode = normalize_unicode_py2
1047
1048
def build_unicode_normalized_lookup_dict(filenames):
    """
    Return a dictionary mapping unicode-normalized filenames to as-encoded
    values to efficiently detect conflicts between the filesystem and manifests.

    This is necessary because some filesystems and utilities may automatically
    apply a different Unicode normalization form to filenames than was applied
    when the bag was originally created.

    The best known example of this is when a bag is created using a
    normalization form other than NFD and then transferred to a Mac where the
    HFS+ filesystem will transparently normalize filenames to a variant of NFD
    for every call:

    https://developer.apple.com/legacy/library/technotes/tn/tn1150.html#UnicodeSubtleties

    Windows is documented as storing filenames exactly as provided:

    https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247%28v=vs.85%29.aspx

    Linux performs no normalization in the kernel but it is technically
    valid for a filesystem to perform normalization, such as when an HFS+
    volume is mounted.

    See http://www.unicode.org/reports/tr15/ for a full discussion of
    equivalence and normalization in Unicode.

    Raises FileNormalizationConflict when two inputs normalize identically.
    """

    mapping = {}

    for name in filenames:
        key = normalize_unicode(name)
        if key in mapping:
            # Two distinct spellings collapse to one normalized form — the
            # bag would not round-trip safely across filesystems.
            raise FileNormalizationConflict(name, mapping[key])
        mapping[key] = name

    return mapping
1087
1088
def get_hashers(algorithms):
    """
    Return a dict mapping each supported algorithm name to a fresh hasher.

    Unsupported algorithms are skipped with a warning; if none of the
    requested algorithms are available a ValueError is raised. Sharing this
    between creation and validation lets both read each file only once:

        for block in file:
            for hasher in hashers.values():
                hasher.update(block)
    """
    available = {}

    for name in algorithms:
        try:
            available[name] = hashlib.new(name)
        except ValueError:
            LOGGER.warning(
                _("Disabling requested hash algorithm %s: hashlib does not support it"),
                name,
            )

    if not available:
        raise ValueError(
            _(
                "Unable to continue: hashlib does not support any of the requested algorithms!"
            )
        )

    return available
1124
1125
def _calc_hashes(args):
    """Pool.map worker: hash one payload file.

    ``args`` is a packed (base_path, rel_path, hashes, algorithms) tuple
    because multiprocessing cannot pass multiple arguments through map().
    Returns (rel_path, {algorithm: hexdigest-or-error-text}, hashes).
    """
    base_path, rel_path, hashes, algorithms = args
    full_path = os.path.join(base_path, rel_path)

    # Fresh hasher instances for this file, restricted to enabled algorithms:
    f_hashers = dict((alg, hashlib.new(alg)) for alg in hashes if alg in algorithms)

    try:
        f_hashes = _calculate_file_hashes(full_path, f_hashers)
    except BagValidationError as e:
        # Record the error text in place of a digest for every algorithm so
        # the parent process can report it per-file:
        f_hashes = dict((alg, force_unicode(e)) for alg in f_hashers)

    return rel_path, f_hashes, hashes
1140
1141
def _calculate_file_hashes(full_path, f_hashers):
    """
    Returns a dictionary of (algorithm, hexdigest) values for the provided
    filename, feeding every hasher from a single read pass.

    Raises BagValidationError if the file cannot be read.
    """
    LOGGER.info(_("Verifying checksum for file %s"), full_path)

    try:
        with open(full_path, "rb") as f:
            # Read in fixed-size blocks until the sentinel empty read at EOF:
            for block in iter(lambda: f.read(HASH_BLOCK_SIZE), b""):
                for hasher in f_hashers.values():
                    hasher.update(block)
    except (OSError, IOError) as e:
        raise BagValidationError(
            _("Could not read %(filename)s: %(error)s")
            % {"filename": full_path, "error": force_unicode(e)}
        )

    return dict((alg, h.hexdigest()) for alg, h in f_hashers.items())
1164
1165
def _load_tag_file(tag_file_name, encoding="utf-8-sig"):
    """Parse a tag file into a dict.

    A tag seen once maps to its string value; repeated tags collect their
    values into a list in parse order.
    """
    tags = {}

    with open_text_file(tag_file_name, "r", encoding=encoding) as tag_file:
        for name, value in _parse_tags(tag_file):
            if name not in tags:
                tags[name] = value
            elif isinstance(tags[name], list):
                tags[name].append(value)
            else:
                # Second occurrence: promote the scalar to a list.
                tags[name] = [tags[name], value]

    return tags
1182
1183
1184 def _parse_tags(tag_file):
1185 """Parses a tag file, according to RFC 2822. This
1186 includes line folding, permitting extra-long
1187 field values.
1188
1189 See http://www.faqs.org/rfcs/rfc2822.html for
1190 more information.
1191 """
1192
1193 tag_name = None
1194 tag_value = None
1195
1196 # Line folding is handled by yielding values only after we encounter
1197 # the start of a new tag, or if we pass the EOF.
1198 for num, line in enumerate(tag_file):
1199 # Skip over any empty or blank lines.
1200 if len(line) == 0 or line.isspace():
1201 continue
1202 elif line[0].isspace() and tag_value is not None: # folded line
1203 tag_value += line
1204 else:
1205 # Starting a new tag; yield the last one.
1206 if tag_name:
1207 yield (tag_name, tag_value.strip())
1208
1209 if ":" not in line:
1210 raise BagValidationError(
1211 _("%(filename)s contains invalid tag: %(line)s")
1212 % {
1213 "line": line.strip(),
1214 "filename": os.path.basename(tag_file.name),
1215 }
1216 )
1217
1218 parts = line.strip().split(":", 1)
1219 tag_name = parts[0].strip()
1220 tag_value = parts[1]
1221
1222 # Passed the EOF. All done after this.
1223 if tag_name:
1224 yield (tag_name, tag_value.strip())
1225
1226
def _make_tag_file(bag_info_path, bag_info):
    """Write ``bag_info`` to ``bag_info_path`` as one "Name: value" line per
    value, with headers sorted alphabetically."""
    with open_text_file(bag_info_path, "w") as f:
        for header in sorted(bag_info):
            values = bag_info[header]
            if not isinstance(values, list):
                values = [values]
            for value in values:
                # strip CR, LF and CRLF so they don't mess up the tag file
                cleaned = re.sub(r"\n|\r|(\r\n)", "", force_unicode(value))
                f.write("%s: %s\n" % (header, cleaned))
1238
1239
def make_manifests(data_dir, processes, algorithms=DEFAULT_CHECKSUMS, encoding="utf-8"):
    """Generate manifest-<algorithm>.txt files for every file under data_dir.

    Returns a (total_bytes, total_files) tuple suitable for Payload-Oxum.
    Manifests are written relative to the current working directory, so the
    caller is expected to have changed into the bag directory first.

    Raises RuntimeError if the per-algorithm file counts or byte totals
    disagree, which would indicate a hashing failure.
    """
    LOGGER.info(
        _("Using %(process_count)d processes to generate manifests: %(algorithms)s"),
        {"process_count": processes, "algorithms": ", ".join(algorithms)},
    )

    manifest_line_generator = partial(generate_manifest_lines, algorithms=algorithms)

    if processes > 1:
        pool = multiprocessing.Pool(processes=processes)
        # Fix: reap the worker processes even if hashing raises, instead of
        # leaking them when pool.map propagates an exception.
        try:
            checksums = pool.map(manifest_line_generator, _walk(data_dir))
        finally:
            pool.close()
            pool.join()
    else:
        checksums = [manifest_line_generator(i) for i in _walk(data_dir)]

    # At this point we have a list of tuples which start with the algorithm name:
    manifest_data = {}
    for batch in checksums:
        for entry in batch:
            manifest_data.setdefault(entry[0], []).append(entry[1:])

    # These will be keyed on the algorithm name so we can perform sanity checks
    # below to catch failures in the hashing process:
    num_files = defaultdict(lambda: 0)
    total_bytes = defaultdict(lambda: 0)

    for algorithm, values in manifest_data.items():
        manifest_filename = "manifest-%s.txt" % algorithm

        with open_text_file(manifest_filename, "w", encoding=encoding) as manifest:
            for digest, filename, byte_count in values:
                manifest.write("%s %s\n" % (digest, _encode_filename(filename)))
                num_files[algorithm] += 1
                total_bytes[algorithm] += byte_count

    # We'll use sets of the values for the error checks and eventually return the payload oxum values:
    byte_value_set = set(total_bytes.values())
    file_count_set = set(num_files.values())

    # allow a bag with an empty payload
    if not byte_value_set and not file_count_set:
        return 0, 0

    if len(file_count_set) != 1:
        raise RuntimeError(_("Expected the same number of files for each checksum"))

    if len(byte_value_set) != 1:
        raise RuntimeError(_("Expected the same number of bytes for each checksums"))

    return byte_value_set.pop(), file_count_set.pop()
1291
1292
def _make_tagmanifest_file(alg, bag_dir, encoding="utf-8"):
    """Write tagmanifest-<alg>.txt listing a digest for every tag file in bag_dir."""
    tagmanifest_file = join(bag_dir, "tagmanifest-%s.txt" % alg)
    LOGGER.info(_("Creating %s"), tagmanifest_file)

    checksums = []
    for f in _find_tag_files(bag_dir):
        # Tagmanifests never list themselves or each other.
        if re.match(r"^tagmanifest-.+\.txt$", f):
            continue
        hasher = hashlib.new(alg)
        with open(join(bag_dir, f), "rb") as fh:
            for block in iter(lambda: fh.read(HASH_BLOCK_SIZE), b""):
                hasher.update(block)
        checksums.append((hasher.hexdigest(), f))

    # NOTE(review): tagmanifest_file already includes bag_dir, so bag_dir is
    # joined twice here; harmless when bag_dir is "." (the usual case after
    # the caller chdirs into the bag) -- confirm against callers before changing.
    with open_text_file(
        join(bag_dir, tagmanifest_file), mode="w", encoding=encoding
    ) as tagmanifest:
        for digest, filename in checksums:
            tagmanifest.write("%s %s\n" % (digest, filename))
1315
1316
1317 def _find_tag_files(bag_dir):
1318 for dir in os.listdir(bag_dir):
1319 if dir != "data":
1320 if os.path.isfile(dir) and not dir.startswith("tagmanifest-"):
1321 yield dir
1322 for dir_name, _, filenames in os.walk(dir):
1323 for filename in filenames:
1324 if filename.startswith("tagmanifest-"):
1325 continue
1326 # remove everything up to the bag_dir directory
1327 p = join(dir_name, filename)
1328 yield os.path.relpath(p, bag_dir)
1329
1330
1331 def _walk(data_dir):
1332 for dirpath, dirnames, filenames in os.walk(data_dir):
1333 # if we don't sort here the order of entries is non-deterministic
1334 # which makes it hard to test the fixity of tagmanifest-md5.txt
1335 filenames.sort()
1336 dirnames.sort()
1337 for fn in filenames:
1338 path = os.path.join(dirpath, fn)
1339 # BagIt spec requires manifest to always use '/' as path separator
1340 if os.path.sep != "/":
1341 parts = path.split(os.path.sep)
1342 path = "/".join(parts)
1343 yield path
1344
1345
1346 def _can_bag(test_dir):
1347 """Scan the provided directory for files which cannot be bagged due to insufficient permissions"""
1348 unbaggable = []
1349
1350 if not os.access(test_dir, os.R_OK):
1351 # We cannot continue without permission to read the source directory
1352 unbaggable.append(test_dir)
1353 return unbaggable
1354
1355 if not os.access(test_dir, os.W_OK):
1356 unbaggable.append(test_dir)
1357
1358 for dirpath, dirnames, filenames in os.walk(test_dir):
1359 for directory in dirnames:
1360 full_path = os.path.join(dirpath, directory)
1361 if not os.access(full_path, os.W_OK):
1362 unbaggable.append(full_path)
1363
1364 return unbaggable
1365
1366
1367 def _can_read(test_dir):
1368 """
1369 returns ((unreadable_dirs), (unreadable_files))
1370 """
1371 unreadable_dirs = []
1372 unreadable_files = []
1373
1374 if not os.access(test_dir, os.R_OK):
1375 unreadable_dirs.append(test_dir)
1376 else:
1377 for dirpath, dirnames, filenames in os.walk(test_dir):
1378 for dn in dirnames:
1379 full_path = os.path.join(dirpath, dn)
1380 if not os.access(full_path, os.R_OK):
1381 unreadable_dirs.append(full_path)
1382 for fn in filenames:
1383 full_path = os.path.join(dirpath, fn)
1384 if not os.access(full_path, os.R_OK):
1385 unreadable_files.append(full_path)
1386 return (tuple(unreadable_dirs), tuple(unreadable_files))
1387
1388
def generate_manifest_lines(filename, algorithms=DEFAULT_CHECKSUMS):
    """Hash one payload file and return (algorithm, digest, filename, size)
    tuples, one per requested algorithm."""
    LOGGER.info(_("Generating manifest lines for file %s"), filename)

    # For performance the file is read only once, with each block fed to
    # every requested hash algorithm:
    hashers = get_hashers(algorithms)
    total_bytes = 0

    with open(filename, "rb") as f:
        for block in iter(lambda: f.read(HASH_BLOCK_SIZE), b""):
            total_bytes += len(block)
            for hasher in hashers.values():
                hasher.update(block)

    decoded_filename = _decode_filename(filename)

    # Roughly manifest format, prefixed with the algorithm name:
    return [
        (alg, hasher.hexdigest(), decoded_filename, total_bytes)
        for alg, hasher in hashers.items()
    ]
1418
1419
1420 def _encode_filename(s):
1421 s = s.replace("\r", "%0D")
1422 s = s.replace("\n", "%0A")
1423 return s
1424
1425
1426 def _decode_filename(s):
1427 s = re.sub(r"%0D", "\r", s, re.IGNORECASE)
1428 s = re.sub(r"%0A", "\n", s, re.IGNORECASE)
1429 return s
1430
1431
def force_unicode_py2(s):
    """Reliably return a Unicode string given a possible unicode or byte string"""
    # Python 2 only: ``str`` here is the byte-string type and ``unicode`` the
    # text type. Byte strings are decoded as UTF-8; anything else (numbers,
    # exceptions, ...) is converted through its text representation.
    if isinstance(s, str):
        return s.decode("utf-8")
    else:
        return unicode(s)
1438
1439
# Bind the version-appropriate text-coercion helper once at import time;
# on Python 3 the builtin str already does the job:
force_unicode = str if sys.version_info > (3, 0) else force_unicode_py2
1444
1445 # following code is used for command line program
1446
1447
class BagArgumentParser(argparse.ArgumentParser):
    """ArgumentParser that seeds the namespace with an empty bag_info dict."""

    def __init__(self, *args, **kwargs):
        super(BagArgumentParser, self).__init__(*args, **kwargs)
        # Metadata options (see BagHeaderAction) accumulate into this mapping:
        self.set_defaults(bag_info={})
1452
1453
class BagHeaderAction(argparse.Action):
    """Store a metadata option as a Bag-Info header in namespace.bag_info.

    e.g. ``--source-organization X`` becomes ``{"Source-Organization": "X"}``.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        words = option_string.lstrip("--").split("-")
        header_name = "-".join(word.capitalize() for word in words)
        namespace.bag_info[header_name] = values
1459
1460
def _make_parser():
    """Build the command-line argument parser for the bagit CLI."""
    parser = BagArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="bagit-python version %s\n\n%s\n" % (VERSION, __doc__.strip()),
    )
    parser.add_argument(
        "--processes",
        type=int,
        dest="processes",
        default=1,
        help=_(
            "Use multiple processes to calculate checksums faster (default: %(default)s)"
        ),
    )
    parser.add_argument("--log", help=_("The name of the log file (default: stdout)"))
    parser.add_argument(
        "--quiet",
        action="store_true",
        help=_("Suppress all progress information other than errors"),
    )
    parser.add_argument(
        "--validate",
        action="store_true",
        help=_(
            "Validate existing bags in the provided directories instead of"
            " creating new ones"
        ),
    )
    parser.add_argument(
        "--fast",
        action="store_true",
        help=_(
            "Modify --validate behaviour to only test whether the bag directory"
            " has the number of files and total size specified in Payload-Oxum"
            " without performing checksum validation to detect corruption."
        ),
    )
    parser.add_argument(
        "--completeness-only",
        action="store_true",
        help=_(
            "Modify --validate behaviour to test whether the bag directory"
            " has the expected payload specified in the checksum manifests"
            " without performing checksum validation to detect corruption."
        ),
    )

    # One --md5 / --sha256 / ... flag per supported algorithm; each appends
    # its constant to args.checksums so multiple manifests can be requested:
    checksum_args = parser.add_argument_group(
        _("Checksum Algorithms"),
        _(
            "Select the manifest algorithms to be used when creating bags"
            " (default=%s)"
        )
        % ", ".join(DEFAULT_CHECKSUMS),
    )

    for i in CHECKSUM_ALGOS:
        # Display e.g. "sha256" as "SHA-256" in the help text:
        alg_name = re.sub(r"^([A-Z]+)(\d+)$", r"\1-\2", i.upper())
        checksum_args.add_argument(
            "--%s" % i,
            action="append_const",
            dest="checksums",
            const=i,
            help=_("Generate %s manifest when creating a bag") % alg_name,
        )

    # Each standard Bag-Info header becomes a lower-cased option whose value
    # BagHeaderAction collects into the bag_info dict:
    metadata_args = parser.add_argument_group(_("Optional Bag Metadata"))
    for header in STANDARD_BAG_INFO_HEADERS:
        metadata_args.add_argument(
            "--%s" % header.lower(), type=str, action=BagHeaderAction, default=argparse.SUPPRESS
        )

    parser.add_argument(
        "directory",
        nargs="+",
        help=_(
            "Directory which will be converted into a bag in place"
            " by moving any existing files into the BagIt structure"
            " and creating the manifests and other metadata."
        ),
    )

    return parser
1544
1545
1546 def _configure_logging(opts):
1547 log_format = "%(asctime)s - %(levelname)s - %(message)s"
1548 if opts.quiet:
1549 level = logging.ERROR
1550 else:
1551 level = logging.INFO
1552 if opts.log:
1553 logging.basicConfig(filename=opts.log, level=level, format=log_format)
1554 else:
1555 logging.basicConfig(level=level, format=log_format)
1556
1557
def main():
    """Command-line entry point: create or validate a bag for each directory
    argument, exiting non-zero if any operation fails."""
    # Handle --version before argparse so it works without directory args:
    if "--version" in sys.argv:
        print(_("bagit-python version %s") % VERSION)
        sys.exit(0)

    parser = _make_parser()
    args = parser.parse_args()

    if args.processes < 0:
        parser.error(_("The number of processes must be 0 or greater"))

    if args.fast and not args.validate:
        parser.error(_("--fast is only allowed as an option for --validate!"))

    # Consistency fix: --completeness-only is likewise meaningless without
    # --validate, so reject it the same way --fast is rejected instead of
    # silently ignoring it.
    if args.completeness_only and not args.validate:
        parser.error(
            _("--completeness-only is only allowed as an option for --validate!")
        )

    _configure_logging(args)

    rc = 0
    for bag_dir in args.directory:
        # validate the bag
        if args.validate:
            try:
                bag = Bag(bag_dir)
                # validate throws a BagError or BagValidationError
                bag.validate(
                    processes=args.processes,
                    fast=args.fast,
                    completeness_only=args.completeness_only,
                )
                if args.fast:
                    LOGGER.info(_("%s valid according to Payload-Oxum"), bag_dir)
                else:
                    LOGGER.info(_("%s is valid"), bag_dir)
            except BagError as e:
                LOGGER.error(
                    _("%(bag)s is invalid: %(error)s"), {"bag": bag_dir, "error": e}
                )
                rc = 1

        # make the bag
        else:
            try:
                make_bag(
                    bag_dir,
                    bag_info=args.bag_info,
                    processes=args.processes,
                    checksums=args.checksums,
                )
            except Exception as exc:
                LOGGER.error(
                    _("Failed to create bag in %(bag_directory)s: %(error)s"),
                    {"bag_directory": bag_dir, "error": exc},
                    exc_info=True,
                )
                rc = 1

    sys.exit(rc)
1614
1615
1616 if __name__ == "__main__":
1617 main()