inline_container.h 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  #pragma once

  #include <cerrno>
  #include <cstddef>
  #include <cstdint>
  #include <cstdio>
  #include <cstring>
  #include <fstream>
  #include <functional>
  #include <istream>
  #include <memory>
  #include <mutex>
  #include <ostream>
  #include <string>
  #include <tuple>
  #include <unordered_set>
  #include <vector>

  #include <c10/core/Allocator.h>
  #include <c10/core/Backend.h>

  #include "caffe2/serialize/istream_adapter.h"
  #include "caffe2/serialize/read_adapter_interface.h"
  #include "caffe2/serialize/versions.h"
  15. extern "C" {
  16. typedef struct mz_zip_archive mz_zip_archive;
  17. }
  18. // PyTorch containers are a special zip archive with the following layout
  19. // archive_name.zip contains:
  20. // archive_name/
  21. // version # a file with a single decimal number written in ascii,
  22. // # used to establish the version of the archive format
  23. // model.json # overall model description, this is a json output of
  24. // # ModelDef from torch.proto
  25. // # the following names are by convention only, model.json will
  26. // # refer to these files by full names
  27. // tensors/
  28. // 0 # flat storage for tensor data, meta-data about shapes, etc. is
  29. // # in model.json
  30. // 1
  31. // ...
  32. // # code entries will only exist for modules that have methods attached
  33. // code/
  34. // archive_name.py # serialized torch script code (python syntax, using
  35. // # PythonPrint)
  36. // archive_name_my_submodule.py # submodules have separate files
  37. //
  38. // The PyTorchStreamWriter also ensures additional useful properties for these
  39. // files
  40. // 1. All files are stored uncompressed.
  41. // 2. All files in the archive are aligned to 64 byte boundaries such that
  42. // it is possible to mmap the entire file and get an aligned pointer to
  43. // tensor data.
  44. // 3. We universally write in ZIP64 format for consistency.
  45. // The PyTorchStreamReader also provides additional properties:
  46. // 1. It can read zip files that are created with common
  47. // zip tools. This means that even though our writer doesn't compress files,
  48. // the reader can still read files that were compressed.
  49. // 2. It provides a getRecordOffset function which returns the offset into the
  50. // raw file where file data lives. If the file was written with
  51. // PyTorchStreamWriter it is guaranteed to be 64 byte aligned.
  52. // PyTorchReader/Writer handle checking the version number on the archive format
  53. // and ensure that all files are written to an archive_name directory so they
  54. // unzip cleanly.
  55. // When developing this format we want to pay particular attention to the
  56. // following use cases:
  57. //
  58. // -- Reading --
  59. // 1) Reading with full random access
  60. // a) Reading with file api's such as fread()
  61. // b) mmaping the file and jumping around the mapped region
  62. // 2) Reading with 1-pass sequential access
  63. // -> A reader will need to build up a data structure of parsed structures
  64. // as it reads
  65. //
  66. // -- Writing --
  67. // 1) Writing with full random access
  68. // 2) Writing with 1-pass sequential access
  69. // -> We must take care not to require updating values that have already
  70. // been written. We place the variable-length index at the end and do
  71. // not put any indices into the header to fulfill this constraint.
  72. // The model.json, which contains all the metadata information,
  73. // should be written as the last file. One reason is that the size of tensor
  74. // data is usually stable. As long as the shape and type of the tensor do not
  75. // change, the size of the data won't change. On the other side, the size of the
  76. // serialized model is likely to change, so we store it as the last record, and
  77. // we don't need to move previous records when updating the model data.
  78. // The zip format is sufficiently flexible to handle the above use-case.
  79. // It puts its central directory at the end of the archive and we write
  80. // model.json as the last file when writing after we have accumulated all
  81. // other information.
  82. namespace caffe2 {
  83. namespace serialize {
  84. class TORCH_API PyTorchStreamReader final {
  85. public:
  86. explicit PyTorchStreamReader(const std::string& file_name);
  87. explicit PyTorchStreamReader(std::istream* in);
  88. explicit PyTorchStreamReader(std::shared_ptr<ReadAdapterInterface> in);
  89. // return dataptr, size
  90. std::tuple<at::DataPtr, size_t> getRecord(const std::string& name);
  91. size_t getRecordOffset(const std::string& name);
  92. bool hasRecord(const std::string& name);
  93. std::vector<std::string> getAllRecords();
  94. ~PyTorchStreamReader();
  95. uint64_t version() const {
  96. return version_;
  97. }
  98. void setShouldLoadDebugSymbol(bool should_load_debug_symbol) {
  99. load_debug_symbol_ = should_load_debug_symbol;
  100. }
  101. private:
  102. void init();
  103. size_t read(uint64_t pos, char* buf, size_t n);
  104. void valid(const char* what, const char* info = "");
  105. size_t getRecordID(const std::string& name);
  106. friend size_t
  107. istream_read_func(void* pOpaque, uint64_t file_ofs, void* pBuf, size_t n);
  108. std::unique_ptr<mz_zip_archive> ar_;
  109. std::string archive_name_;
  110. std::string archive_name_plus_slash_;
  111. std::shared_ptr<ReadAdapterInterface> in_;
  112. int64_t version_;
  113. std::mutex reader_lock_;
  114. bool load_debug_symbol_ = true;
  115. };
  116. class TORCH_API PyTorchStreamWriter final {
  117. public:
  118. explicit PyTorchStreamWriter(std::string archive_name);
  119. explicit PyTorchStreamWriter(
  120. const std::function<size_t(const void*, size_t)> writer_func);
  121. void setMinVersion(const uint64_t version);
  122. void writeRecord(
  123. const std::string& name,
  124. const void* data,
  125. size_t size,
  126. bool compress = false);
  127. void writeEndOfFile();
  128. const std::unordered_set<std::string>& getAllWrittenRecords();
  129. bool finalized() const {
  130. return finalized_;
  131. }
  132. const std::string& archiveName() {
  133. return archive_name_;
  134. }
  135. ~PyTorchStreamWriter();
  136. private:
  137. void setup(const std::string& file_name);
  138. void valid(const char* what, const char* info = "");
  139. size_t current_pos_ = 0;
  140. std::unordered_set<std::string> files_written_;
  141. std::unique_ptr<mz_zip_archive> ar_;
  142. std::string archive_name_;
  143. std::string archive_name_plus_slash_;
  144. std::string padding_;
  145. std::ofstream file_stream_;
  146. std::function<size_t(const void*, size_t)> writer_func_;
  147. // This number will be updated when the model has operators
  148. // that have valid upgraders.
  149. uint64_t version_ = kMinProducedFileFormatVersion;
  150. bool finalized_ = false;
  151. bool err_seen_ = false;
  152. friend size_t ostream_write_func(
  153. void* pOpaque,
  154. uint64_t file_ofs,
  155. const void* pBuf,
  156. size_t n);
  157. };
  namespace detail {

  // Writer-specific constant: the byte boundary that record data is aligned
  // to (see getPadding below).
  constexpr uint64_t kFieldAlignment{64};

  // Returns a record to be appended to the local user extra data entry so
  // that the data begins on a kFieldAlignment-byte boundary.
  //
  // NOTE(review): only the declaration is visible here. `padding_buf` is a
  // mutable reference, so it is presumably filled with the padding bytes while
  // the returned size_t gives their length -- confirm against the definition.
  size_t getPadding(
      size_t cursor,
      size_t filename_size,
      size_t size,
      std::string& padding_buf);

  } // namespace detail
  169. } // namespace serialize
  170. } // namespace caffe2