parent
fe18341585
commit
9dc69582de
@ -0,0 +1,58 @@
|
|||||||
|
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
|
||||||
|
# Skip the rest of this module when MOBILE_INFERENCE is enabled.
if(MOBILE_INFERENCE)
  return()
endif()
|
||||||
|
|
||||||
|
include (ExternalProject)
|
||||||
|
|
||||||
|
# NOTE: snappy is needed when linking with recordio
|
||||||
|
|
||||||
|
# Prefix used by ExternalProject for the snappystream download/build tree.
set(SNAPPYSTREAM_SOURCES_DIR "${THIRD_PARTY_PATH}/snappy_stream")
# Install prefix that the external project installs into.
set(SNAPPYSTREAM_INSTALL_DIR "${THIRD_PARTY_PATH}/install/snappy_stream")
# Header directory below the install prefix, published as a forced cache entry.
set(SNAPPYSTREAM_INCLUDE_DIR
    "${SNAPPYSTREAM_INSTALL_DIR}/include/"
    CACHE PATH "snappy stream include directory." FORCE)
|
||||||
|
|
||||||
|
# Fetch snappystream 0.2.8 from git, configure it against the snappy install
# tree (SNAPPY_ROOT) and install it into SNAPPYSTREAM_INSTALL_DIR.
#
# - The install prefix/libdir given in CMAKE_ARGS now match the values given in
#   CMAKE_CACHE_ARGS; previously CMAKE_ARGS pointed at the snappy install
#   directory and only the forced cache arguments made the install land in the
#   right place.
# - BUILD_COMMAND / INSTALL_COMMAND are left at their ExternalProject defaults
#   so the build is driven through the configured generator instead of a
#   hard-coded `make -j8`.
# - UPDATE_COMMAND is empty because the source is pinned to a tag.
ExternalProject_Add(
    extern_snappystream
    GIT_REPOSITORY  "https://github.com/hoxnox/snappystream.git"
    GIT_TAG         "0.2.8"
    PREFIX          ${SNAPPYSTREAM_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
                    -DCMAKE_INSTALL_PREFIX=${SNAPPYSTREAM_INSTALL_DIR}
                    -DCMAKE_INSTALL_LIBDIR=${SNAPPYSTREAM_INSTALL_DIR}/lib
                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                    -DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR}
                    ${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS
                    -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
                    -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
                    -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
    DEPENDS         snappy
)
|
||||||
|
|
||||||
|
# Expose the installed static archive as a global imported target.
add_library(snappystream STATIC IMPORTED GLOBAL)
set_target_properties(snappystream PROPERTIES
    IMPORTED_LOCATION "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")

# The archive only exists once the external project has been built.
add_dependencies(snappystream extern_snappystream)

# Make the installed headers visible to targets defined in this directory scope.
include_directories(${SNAPPYSTREAM_INCLUDE_DIR})
|
@ -1,14 +1,6 @@
|
|||||||
# internal library.
|
# internal library.
|
||||||
cc_library(io SRCS io.cc DEPS stringpiece)
|
cc_library(header SRCS header.cc)
|
||||||
cc_test(io_test SRCS io_test.cc DEPS io)
|
|
||||||
cc_library(header SRCS header.cc DEPS io)
|
|
||||||
cc_test(header_test SRCS header_test.cc DEPS header)
|
cc_test(header_test SRCS header_test.cc DEPS header)
|
||||||
cc_library(chunk SRCS chunk.cc DEPS snappy)
|
cc_library(chunk SRCS chunk.cc DEPS snappystream snappy header zlib)
|
||||||
cc_test(chunk_test SRCS chunk_test.cc DEPS chunk)
|
cc_test(chunk_test SRCS chunk_test.cc DEPS chunk)
|
||||||
cc_library(range_scanner SRCS range_scanner.cc DEPS io chunk)
|
cc_library(recordio DEPS chunk header)
|
||||||
cc_test(range_scanner_test SRCS range_scanner_test.cc DEPS range_scanner)
|
|
||||||
cc_library(scanner SRCS scanner.cc DEPS range_scanner)
|
|
||||||
cc_test(scanner_test SRCS scanner_test.cc DEPS scanner)
|
|
||||||
# exported library.
|
|
||||||
cc_library(recordio SRCS recordio.cc DEPS scanner chunk header)
|
|
||||||
cc_test(recordio_test SRCS recordio_test.cc DEPS scanner)
|
|
||||||
|
@ -1,33 +0,0 @@
|
|||||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
// A wrapper on crc library https://github.com/d-bahr/CRCpp
|
|
||||||
#include <cstdint>
|
|
||||||
|
|
||||||
#include "paddle/fluid/recordio/detail/crc.h"
|
|
||||||
|
|
||||||
namespace paddle {
|
|
||||||
namespace recordio {
|
|
||||||
|
|
||||||
// usage
|
|
||||||
// char data[] = "hello,world";
|
|
||||||
// crc = Crc32(data, 12);
|
|
||||||
// Assert_EQ(crc, 68a85159);
|
|
||||||
|
|
||||||
uint32_t Crc32(const char* data, size_t size) {
|
|
||||||
return CRC::Calculate(data, size, CRC::CRC_32())
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace recordio
|
|
||||||
} // namespace paddle
|
|
File diff suppressed because it is too large
Load Diff
@ -1,55 +0,0 @@
|
|||||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
#include "paddle/fluid/recordio/io.h"
|
|
||||||
#include "paddle/fluid/string/piece.h"
|
|
||||||
|
|
||||||
#include <iostream>
|
|
||||||
|
|
||||||
namespace paddle {
|
|
||||||
namespace recordio {
|
|
||||||
// Opens `filename` with the fopen-style `mode` and wraps it in a FileStream.
//
// Only paths that start with "/" are handled at the moment.
// Returns a heap-allocated Stream that the caller must delete, or nullptr
// when the path is not supported or fopen() fails, so that callers never get
// a stream wrapping a null FILE*.
Stream* Stream::Open(const char* filename, const char* mode) {
  // Create IOStream for different filesystems
  // HDFS: hdfs://tmp/file.txt
  // Default: /tmp/file.txt
  if (!string::HasPrefix(string::Piece(filename), string::Piece("/"))) {
    return nullptr;
  }
  FILE* fp = fopen(filename, mode);
  if (fp == nullptr) {
    return nullptr;
  }
  return new FileStream(fp);
}
|
|
||||||
|
|
||||||
size_t FileStream::Read(void* ptr, size_t size) {
|
|
||||||
return fread(ptr, 1, size, fp_);
|
|
||||||
}
|
|
||||||
|
|
||||||
void FileStream::Write(const void* ptr, size_t size) {
|
|
||||||
size_t real = fwrite(ptr, 1, size, fp_);
|
|
||||||
PADDLE_ENFORCE(real == size, "FileStream write incomplete.");
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns the current file position as reported by ftell().
size_t FileStream::Tell() {
  return ftell(fp_);
}
|
|
||||||
void FileStream::Seek(size_t p) { fseek(fp_, p, SEEK_SET); }
|
|
||||||
|
|
||||||
// True once the end-of-file indicator of the underlying FILE is set.
bool FileStream::Eof() {
  return feof(fp_) != 0;
}
|
|
||||||
|
|
||||||
void FileStream::Close() {
|
|
||||||
if (fp_ != nullptr) {
|
|
||||||
fclose(fp_);
|
|
||||||
fp_ = nullptr;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace recordio
|
|
||||||
} // namespace paddle
|
|
@ -1,56 +0,0 @@
|
|||||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "paddle/fluid/platform/enforce.h"
|
|
||||||
#include "paddle/fluid/platform/macros.h" // DISABLE_COPY_ASSIGN
|
|
||||||
|
|
||||||
namespace paddle {
|
|
||||||
namespace recordio {
|
|
||||||
|
|
||||||
// Seekable Stream Interface for read and write
|
|
||||||
class Stream {
 public:
  virtual ~Stream() {}

  // Reads up to `size` bytes into `ptr`; returns the number of bytes read.
  virtual size_t Read(void* ptr, size_t size) = 0;

  // Writes `size` bytes taken from `ptr`.
  virtual void Write(const void* ptr, size_t size) = 0;

  // Returns the current position in the stream.
  virtual size_t Tell() = 0;

  // Moves the current position to `p`.
  virtual void Seek(size_t p) = 0;

  // Create Stream Instance
  static Stream* Open(const char* filename, const char* mode);
};
|
|
||||||
|
|
||||||
// FileStream
|
|
||||||
class FileStream : public Stream {
|
|
||||||
public:
|
|
||||||
explicit FileStream(FILE* fp) : fp_(fp) {}
|
|
||||||
~FileStream() { this->Close(); }
|
|
||||||
size_t Read(void* ptr, size_t size);
|
|
||||||
void Write(const void* ptr, size_t size);
|
|
||||||
size_t Tell();
|
|
||||||
void Seek(size_t p);
|
|
||||||
bool Eof();
|
|
||||||
void Close();
|
|
||||||
|
|
||||||
private:
|
|
||||||
FILE* fp_;
|
|
||||||
DISABLE_COPY_AND_ASSIGN(FileStream);
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace recordio
|
|
||||||
} // namespace paddle
|
|
@ -1,36 +0,0 @@
|
|||||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
#include "paddle/fluid/recordio/io.h"
|
|
||||||
|
|
||||||
#include "gtest/gtest.h"
|
|
||||||
|
|
||||||
using namespace paddle::recordio;
|
|
||||||
|
|
||||||
// Round-trips "hello" (including its terminating NUL, 6 bytes) through a file.
TEST(FileStream, IO) {
  const char* path = "/tmp/record_0";

  // Write
  {
    Stream* out = Stream::Open(path, "w");
    out->Write("hello", 6);
    delete out;
  }

  // Read
  {
    Stream* in = Stream::Open(path, "r+");
    char buf[10];
    in->Read(&buf, 6);
    EXPECT_STREQ(buf, "hello");
    delete in;
  }
}
|
|
@ -1,85 +0,0 @@
|
|||||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
#include "paddle/fluid/recordio/range_scanner.h"
|
|
||||||
|
|
||||||
namespace paddle {
|
|
||||||
namespace recordio {
|
|
||||||
|
|
||||||
// Walks the stream header by header and records, for every chunk, its offset
// and its record count, accumulating the total number of records.
//
// NOTE(review): range_scanner.h declares LoadIndex(Stream*), while this
// definition takes FileStream* (it needs Eof()) - the two must be reconciled.
// NOTE(review): looping on !Eof() presumably runs one extra iteration after the
// last header, and nothing here skips the chunk payload - confirm against
// Header::Parse.
void Index::LoadIndex(FileStream* fi) {
  int64_t chunk_start = 0;
  for (; !fi->Eof();) {
    Header header;
    header.Parse(fi);
    chunk_offsets_.push_back(chunk_start);
    chunk_lens_.push_back(header.NumRecords());
    chunk_records_.push_back(header.NumRecords());
    num_records_ += header.NumRecords();
    chunk_start += header.CompressSize();
  }
}
|
|
||||||
|
|
||||||
// Returns an Index that describes only the i-th chunk of this index.
// An out-of-range `i` yields an empty Index (zero chunks, zero records).
Index Index::ChunkIndex(int i) {
  Index idx;
  if (i < 0 || static_cast<size_t>(i) >= chunk_lens_.size()) {
    return idx;
  }
  idx.chunk_offsets_.push_back(chunk_offsets_[i]);
  idx.chunk_lens_.push_back(chunk_lens_[i]);
  idx.chunk_records_.push_back(chunk_records_[i]);
  // LoadIndex stores the per-chunk record count in chunk_lens_.
  idx.num_records_ = static_cast<int>(chunk_lens_[i]);
  return idx;
}
|
|
||||||
|
|
||||||
// Maps a global record index to (chunk index, record index inside that chunk).
// Returns (-1, -1) when `record_idx` is negative or past the last record.
//
// The search stops at the first chunk whose cumulative record count exceeds
// `record_idx`; without the early return every later chunk would overwrite the
// result.
std::pair<int, int> Index::Locate(int record_idx) {
  if (record_idx < 0) {
    return std::make_pair(-1, -1);
  }
  int records_before = 0;  // records contained in chunks [0, i)
  for (size_t i = 0; i < chunk_lens_.size(); ++i) {
    const int len = static_cast<int>(chunk_lens_[i]);
    if (record_idx < records_before + len) {
      return std::make_pair(static_cast<int>(i), record_idx - records_before);
    }
    records_before += len;
  }
  return std::make_pair(-1, -1);
}
|
|
||||||
|
|
||||||
// Builds a scanner over records [start, start + len) of the file behind `fi`.
// A negative `start` is treated as 0, a `start` past the end is clamped to the
// record count, and a negative or too large `len` means "until the last
// record".
//
// The stream is stored in the member `fi` as declared in range_scanner.h (the
// previous initializer named a non-existent member `stream_`).
RangeScanner::RangeScanner(Stream* fi, Index idx, int start, int len)
    : fi(fi), index_(idx) {
  const int total = idx.NumRecords();
  if (start < 0) {
    start = 0;
  }
  if (start > total) {
    start = total;
  }
  if (len < 0 || len > total - start) {
    len = total - start;
  }

  start_ = start;
  end_ = start + len;
  cur_ = start - 1;  // The initial status required by Scan
  chunk_index_ = -1;
  chunk_.reset(new Chunk);
}
|
|
||||||
|
|
||||||
bool RangeScanner::Scan() {
|
|
||||||
++cur_;
|
|
||||||
if (cur_ >= end_) {
|
|
||||||
return false;
|
|
||||||
} else {
|
|
||||||
auto cursor = index_.Locate(cur_);
|
|
||||||
if (chunk_index_ != cursor.first) {
|
|
||||||
chunk_index_ = cursor.first;
|
|
||||||
chunk_->Parse(fi, index_.ChunkOffsets[chunk_index_]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
const std::string RangeScanner::Record() {
|
|
||||||
auto cursor = index_.Locate(cur_);
|
|
||||||
return chunk_->Record(cursor.second);
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace recordio
|
|
||||||
} // namespace paddle
|
|
@ -1,81 +0,0 @@
|
|||||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include <utility>
|
|
||||||
|
|
||||||
#include "paddle/fluid/recordio/chunk.h"
|
|
||||||
#include "paddle/fluid/recordio/io.h"
|
|
||||||
|
|
||||||
namespace paddle {
|
|
||||||
namespace recordio {
|
|
||||||
|
|
||||||
// Index consists of the offsets and sizes of the consecutive chunks in a RecordIO
|
|
||||||
// file.
|
|
||||||
//
|
|
||||||
// Index supports Gob. Every field in the Index needs to be exported
|
|
||||||
// for the correct encoding and decoding using Gob.
|
|
||||||
class Index {
|
|
||||||
public:
|
|
||||||
Index() : num_records_(0) {}
|
|
||||||
// LoadIndex scans the file and parse chunkOffsets, chunkLens, and len.
|
|
||||||
void LoadIndex(Stream* fi);
|
|
||||||
// NumRecords returns the total number of all records in a RecordIO file.
|
|
||||||
int NumRecords() { return num_records_; }
|
|
||||||
// NumChunks returns the total number of chunks in a RecordIO file.
|
|
||||||
int NumChunks() { return chunk_lens_.size(); }
|
|
||||||
// ChunkIndex return the Index of i-th Chunk.
|
|
||||||
int ChunkIndex(int i);
|
|
||||||
|
|
||||||
int64_t ChunkOffsets(int i) { return chunk_offsets_[i]; }
|
|
||||||
|
|
||||||
// Locate returns the index of chunk that contains the given record,
|
|
||||||
// and the record index within the chunk. It returns (-1, -1) if the
|
|
||||||
// record is out of range.
|
|
||||||
std::pair<int, int> Locate(int record_idx);
|
|
||||||
|
|
||||||
private:
|
|
||||||
// the offset of each chunk in a file.
|
|
||||||
std::vector<int64_t> chunk_offsets_;
|
|
||||||
// the length of each chunk in a file.
|
|
||||||
std::vector<uint32_t> chunk_lens_;
|
|
||||||
// the numer of all records in a file.
|
|
||||||
int num_records_;
|
|
||||||
// the number of records in chunks.
|
|
||||||
std::vector<int> chunk_records_;
|
|
||||||
};
|
|
||||||
|
|
||||||
// RangeScanner
|
|
||||||
class RangeScanner {
|
|
||||||
public:
|
|
||||||
// creates a scanner that sequencially reads records in the
|
|
||||||
// range [start, start+len). If start < 0, it scans from the
|
|
||||||
// beginning. If len < 0, it scans till the end of file.
|
|
||||||
RangeScanner(Stream* fi, Index idx, int start, int end);
|
|
||||||
// Scan moves the cursor forward for one record and loads the chunk
|
|
||||||
// containing the record if not yet.
|
|
||||||
bool Scan();
|
|
||||||
const std::string Record();
|
|
||||||
|
|
||||||
private:
|
|
||||||
Stream* fi;
|
|
||||||
Index index_;
|
|
||||||
int start_, end_, cur_;
|
|
||||||
int chunk_index_;
|
|
||||||
std::unique_ptr<Chunk> chunk_;
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace recordio
|
|
||||||
} // namespace paddle
|
|
@ -1,23 +0,0 @@
|
|||||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
#include "paddle/fluid/recordio/range_scanner.h"
|
|
||||||
|
|
||||||
#include "gtest/gtest.h"
|
|
||||||
|
|
||||||
using namespace paddle::recordio;
|
|
||||||
|
|
||||||
// Placeholder test: only checks that an output stream can be opened.
// The stream is deleted so the test does not leak the file handle.
TEST(RangeScanner, Recordio) {
  Stream* fo = Stream::Open("/tmp/record_range", "w");
  ASSERT_TRUE(fo != nullptr);
  delete fo;
}
|
|
@ -1,20 +0,0 @@
|
|||||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
#include "paddle/fluid/recordio/io.h"
|
|
||||||
#include "paddle/fluid/string/piece.h"
|
|
||||||
|
|
||||||
namespace paddle {
|
|
||||||
namespace recordio {} // namespace recordio
|
|
||||||
} // namespace paddle
|
|
@ -1,20 +0,0 @@
|
|||||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include "paddle/fluid/recordio/chunk.h"
|
|
||||||
#include "paddle/fluid/recordio/header.h"
|
|
||||||
#include "paddle/fluid/recordio/io.h"
|
|
||||||
#include "paddle/fluid/recordio/scanner.h"
|
|
||||||
#include "paddle/fluid/recordio/writer.h"
|
|
@ -1,68 +0,0 @@
|
|||||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
#include "paddle/fluid/recordio/chunk.h"
|
|
||||||
|
|
||||||
#include <glob.h> // glob
|
|
||||||
|
|
||||||
namespace paddle {
|
|
||||||
namespace recordio {
|
|
||||||
|
|
||||||
// Expands the glob pattern `paths` (with ~ expansion) into the list of files
// to scan.
//
// Every member is initialized: Scan() compares cur_scanner_ against nullptr
// and Err() returns err_, so neither may be left indeterminate. A glob()
// failure other than "no match" is stored in err_; in every failure case the
// path list stays empty.
Scanner::Scanner(const char* paths)
    : cur_file_(nullptr),
      cur_scanner_(nullptr),
      path_idx_(0),
      end_(false),
      err_(0) {
  glob_t glob_result = glob_t();  // zeroed, so globfree() is always safe
  const int rc = glob(paths, GLOB_TILDE, NULL, &glob_result);
  if (rc == 0) {
    for (size_t i = 0; i < glob_result.gl_pathc; ++i) {
      paths_.emplace_back(std::string(glob_result.gl_pathv[i]));
    }
  } else if (rc != GLOB_NOMATCH) {
    err_ = rc;
  }
  globfree(&glob_result);
}
|
|
||||||
|
|
||||||
bool Scanner::Scan() {
|
|
||||||
if (end_ == true) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (cur_scanner_ == nullptr) {
|
|
||||||
if (!NextFile()) {
|
|
||||||
end_ = true;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!cur_scanner_->Scan()) {
|
|
||||||
end_ = true;
|
|
||||||
cur_file_ = nullptr;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool Scanner::NextFile() {
|
|
||||||
if (path_idx_ >= paths_.size()) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
std::string path = paths_[path_idx_];
|
|
||||||
++path_idx_;
|
|
||||||
cur_file_ = Stream::Open(path);
|
|
||||||
if (cur_file_ == nullptr) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
Index idx;
|
|
||||||
idx.LoadIndex(cur_file_);
|
|
||||||
cur_scanner_ = RangeScanner(cur_file_, idx, 0, -1);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace recordio
|
|
||||||
} // namespace paddle
|
|
@ -1,44 +0,0 @@
|
|||||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include "paddle/fluid/recordio/io.h"
|
|
||||||
|
|
||||||
namespace paddle {
|
|
||||||
namespace recordio {
|
|
||||||
|
|
||||||
class RangeScanner;
|
|
||||||
|
|
||||||
// Scanner is a scanner for multiple recordio files.
|
|
||||||
class Scanner {
|
|
||||||
public:
|
|
||||||
Scanner(const char* paths);
|
|
||||||
const std::string Record();
|
|
||||||
bool Scan();
|
|
||||||
void Close();
|
|
||||||
bool NextFile();
|
|
||||||
int Err() { return err_; }
|
|
||||||
|
|
||||||
private:
|
|
||||||
std::vector<std::string> paths_;
|
|
||||||
Stream* cur_file_;
|
|
||||||
RangeScanner* cur_scanner_;
|
|
||||||
int path_idx_;
|
|
||||||
bool end_;
|
|
||||||
int err_;
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace recordio
|
|
||||||
} // namespace paddle
|
|
@ -1,21 +0,0 @@
|
|||||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
#include "paddle/fluid/recordio/scanner.h"
|
|
||||||
|
|
||||||
#include "gtest/gtest.h"
|
|
||||||
|
|
||||||
using namespace paddle::recordio;
|
|
||||||
|
|
||||||
// Construction-only smoke test over the /tmp/record_* glob.
TEST(Scanner, Normal) {
  Scanner s("/tmp/record_*");
}
|
|
@ -1,53 +0,0 @@
|
|||||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
#include "paddle/fluid/recordio/writer.h"
|
|
||||||
|
|
||||||
namespace paddle {
|
|
||||||
namespace recordio {
|
|
||||||
|
|
||||||
// Creates a writer on `fo` with a maximum chunk size of 0 and compressor 0.
// The chunk buffer is allocated here because Write() and Close() dereference
// chunk_ unconditionally, and the compressor value is cast explicitly because
// Compressor is initialized from an integer.
Writer::Writer(Stream* fo)
    : stream_(fo),
      chunk_(new Chunk),
      max_chunk_size_(0),
      compressor_(static_cast<Compressor>(0)) {}
|
|
||||||
|
|
||||||
// Creates a writer on `fo` that dumps the pending chunk whenever adding a
// record would exceed `maxChunkSize` bytes; `compressor` is converted to the
// Compressor enum.
Writer::Writer(Stream* fo, int maxChunkSize, int compressor)
    : stream_(fo),
      chunk_(new Chunk),
      max_chunk_size_(maxChunkSize),
      compressor_(static_cast<Compressor>(compressor)) {}
|
|
||||||
|
|
||||||
size_t Writer::Write(const char* buf, size_t length) {
|
|
||||||
if (stream_ == nullptr) {
|
|
||||||
LOG(WARNING) << "Cannot write since writer had been closed.";
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
if ((length + chunk_->NumBytes()) > max_chunk_size_) {
|
|
||||||
chunk_->Dump(stream_, compressor_);
|
|
||||||
}
|
|
||||||
chunk_->Add(buf, length);
|
|
||||||
return length;
|
|
||||||
}
|
|
||||||
|
|
||||||
// size_t Writer::Write(const char* buf, size_t length) {
|
|
||||||
// return Write(std::string(buf, length));
|
|
||||||
// }
|
|
||||||
|
|
||||||
// size_t Writer::Write(std::string&& buf) {}
|
|
||||||
|
|
||||||
// Dumps the pending chunk to the stream and marks the writer as closed by
// clearing stream_. Calling Close() again is a no-op, so the chunk is never
// dumped to a null stream.
void Writer::Close() {
  if (stream_ == nullptr) {
    return;
  }
  chunk_->Dump(stream_, compressor_);
  stream_ = nullptr;
}
|
|
||||||
|
|
||||||
} // namespace recordio
|
|
||||||
} // namespace paddle
|
|
@ -1,50 +0,0 @@
|
|||||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
#include <memory>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "paddle/fluid/recordio/header.h"
|
|
||||||
#include "paddle/fluid/recordio/io.h"
|
|
||||||
|
|
||||||
namespace paddle {
|
|
||||||
namespace recordio {
|
|
||||||
|
|
||||||
// Writer creates a RecordIO file.
|
|
||||||
class Writer {
|
|
||||||
public:
|
|
||||||
Writer(Stream* fo);
|
|
||||||
Writer(Stream* fo, int maxChunkSize, int c);
|
|
||||||
|
|
||||||
// Writes a record. It returns an error if Close has been called.
|
|
||||||
size_t Write(const char* buf, size_t length);
|
|
||||||
|
|
||||||
// Close flushes the current chunk and makes the writer invalid.
|
|
||||||
void Close();
|
|
||||||
|
|
||||||
private:
|
|
||||||
// Set nullptr to mark a closed writer
|
|
||||||
Stream* stream_;
|
|
||||||
// Chunk for store object
|
|
||||||
std::unique_ptr<Chunk> chunk_;
|
|
||||||
// total records size, excluding metadata, before compression.
|
|
||||||
int max_chunk_size_;
|
|
||||||
// Compressor used for chuck
|
|
||||||
Compressor compressor_;
|
|
||||||
DISABLE_COPY_AND_ASSIGN(Writer);
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace recordio
|
|
||||||
} // namespace paddle
|
|
@ -1,29 +0,0 @@
|
|||||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
#include "paddle/fluid/recordio/writer.h"
|
|
||||||
|
|
||||||
#include "gtest/gtest.h"
|
|
||||||
|
|
||||||
using namespace paddle::recordio;
|
|
||||||
|
|
||||||
// Writes one record, closes the writer, then checks that writing to a closed
// writer is rejected. Writer::Write reports that case by returning 0 (it logs
// a warning and does not throw), so the return value is what is asserted.
TEST(Writer, Normal) {
  Stream* fs = Stream::Open("/tmp/record_21", "w");
  ASSERT_TRUE(fs != nullptr);
  {
    Writer w(fs);
    w.Write("123", 4);

    // test write after close
    w.Close();
    EXPECT_EQ(w.Write("123", 4), 0u);
  }
  // The writer does not own the stream; release it here.
  delete fs;
}
|
|
Loading…
Reference in new issue