parent
fe18341585
commit
9dc69582de
@ -0,0 +1,58 @@
|
||||
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Skip everything below when MOBILE_INFERENCE is set.
if(MOBILE_INFERENCE)
  return()
endif()
|
||||
|
||||
include (ExternalProject)
|
||||
|
||||
# NOTE: snappy is needed when linking with recordio
|
||||
|
||||
# Locations used by the snappystream external project:
#   SNAPPYSTREAM_SOURCES_DIR - ExternalProject PREFIX directory
#   SNAPPYSTREAM_INSTALL_DIR - install prefix of the built library
#   SNAPPYSTREAM_INCLUDE_DIR - installed headers, published as a cache entry
set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
set(SNAPPYSTREAM_INCLUDE_DIR
    "${SNAPPYSTREAM_INSTALL_DIR}/include/"
    CACHE PATH "snappy stream include directory." FORCE)
|
||||
|
||||
# Fetch, build and install snappystream 0.2.8 into SNAPPYSTREAM_INSTALL_DIR.
#
# snappystream is built against the snappy install tree (SNAPPY_ROOT) and
# therefore depends on the `snappy` target.
#
# The install prefix/libdir passed through CMAKE_ARGS must point at the
# snappystream install tree (not snappy's) so that they agree with
# CMAKE_CACHE_ARGS and with the IMPORTED_LOCATION of the `snappystream`
# target, which expects ${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a.
ExternalProject_Add(
    extern_snappystream
    GIT_REPOSITORY  "https://github.com/hoxnox/snappystream.git"
    GIT_TAG         "0.2.8"
    PREFIX          ${SNAPPYSTREAM_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
                    -DCMAKE_INSTALL_PREFIX=${SNAPPYSTREAM_INSTALL_DIR}
                    -DCMAKE_INSTALL_LIBDIR=${SNAPPYSTREAM_INSTALL_DIR}/lib
                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                    -DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR}
                    ${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS
                    -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
                    -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
                    -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
    # NOTE(review): the build/install commands hard-code `make`, so this only
    # works with Makefile-based generators.
    BUILD_COMMAND   make -j8
    INSTALL_COMMAND make install
    DEPENDS         snappy
)
|
||||
|
||||
# Expose the installed static archive as the imported target `snappystream`.
add_library(snappystream STATIC IMPORTED GLOBAL)
set_target_properties(snappystream PROPERTIES
    IMPORTED_LOCATION "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")

# The archive only exists once the external project has been built.
add_dependencies(snappystream extern_snappystream)

# Make the installed snappystream headers visible to targets in this scope.
include_directories(${SNAPPYSTREAM_INCLUDE_DIR})
|
@ -1,14 +1,6 @@
|
||||
# internal library.
|
||||
cc_library(io SRCS io.cc DEPS stringpiece)
|
||||
cc_test(io_test SRCS io_test.cc DEPS io)
|
||||
cc_library(header SRCS header.cc DEPS io)
|
||||
cc_library(header SRCS header.cc)
|
||||
cc_test(header_test SRCS header_test.cc DEPS header)
|
||||
cc_library(chunk SRCS chunk.cc DEPS snappy)
|
||||
cc_library(chunk SRCS chunk.cc DEPS snappystream snappy header zlib)
|
||||
cc_test(chunk_test SRCS chunk_test.cc DEPS chunk)
|
||||
cc_library(range_scanner SRCS range_scanner.cc DEPS io chunk)
|
||||
cc_test(range_scanner_test SRCS range_scanner_test.cc DEPS range_scanner)
|
||||
cc_library(scanner SRCS scanner.cc DEPS range_scanner)
|
||||
cc_test(scanner_test SRCS scanner_test.cc DEPS scanner)
|
||||
# exported library.
|
||||
cc_library(recordio SRCS recordio.cc DEPS scanner chunk header)
|
||||
cc_test(recordio_test SRCS recordio_test.cc DEPS scanner)
|
||||
cc_library(recordio DEPS chunk header)
|
||||
|
@ -1,33 +0,0 @@
|
||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// A wrapper on crc library https://github.com/d-bahr/CRCpp
|
||||
#include <cstdint>
|
||||
|
||||
#include "paddle/fluid/recordio/detail/crc.h"
|
||||
|
||||
namespace paddle {
|
||||
namespace recordio {
|
||||
|
||||
// usage
|
||||
// char data[] = "hello,world";
|
||||
// crc = Crc32(data, 12);
|
||||
// Assert_EQ(crc, 68a85159);
|
||||
|
||||
uint32_t Crc32(const char* data, size_t size) {
|
||||
return CRC::Calculate(data, size, CRC::CRC_32())
|
||||
}
|
||||
|
||||
} // namespace recordio
|
||||
} // namespace paddle
|
File diff suppressed because it is too large
Load Diff
@ -1,55 +0,0 @@
|
||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "paddle/fluid/recordio/io.h"
|
||||
#include "paddle/fluid/string/piece.h"
|
||||
|
||||
#include <iostream>
|
||||
|
||||
namespace paddle {
|
||||
namespace recordio {
|
||||
// Opens `filename` with the given fopen-style `mode` and wraps it in a
// heap-allocated FileStream that the caller owns (release with `delete`).
//
// Create IOStream for different filesystems
// HDFS: hdfs://tmp/file.txt
// Default: /tmp/file.txt
//
// Only absolute local paths (starting with "/") are handled at the moment.
// Returns nullptr when the path is not supported or the file cannot be
// opened, so that no stream is ever handed out around a null FILE*.
Stream* Stream::Open(const char* filename, const char* mode) {
  if (filename == nullptr || mode == nullptr) {
    return nullptr;
  }
  if (!string::HasPrefix(string::Piece(filename), string::Piece("/"))) {
    return nullptr;
  }
  FILE* fp = fopen(filename, mode);
  if (fp == nullptr) {
    return nullptr;
  }
  return new FileStream(fp);
}
|
||||
|
||||
size_t FileStream::Read(void* ptr, size_t size) {
|
||||
return fread(ptr, 1, size, fp_);
|
||||
}
|
||||
|
||||
void FileStream::Write(const void* ptr, size_t size) {
|
||||
size_t real = fwrite(ptr, 1, size, fp_);
|
||||
PADDLE_ENFORCE(real == size, "FileStream write incomplete.");
|
||||
}
|
||||
|
||||
// Returns the current position of the underlying FILE* as reported by ftell.
size_t FileStream::Tell() {
  const long position = ftell(fp_);
  return position;
}
|
||||
void FileStream::Seek(size_t p) { fseek(fp_, p, SEEK_SET); }
|
||||
|
||||
// True once the end-of-file indicator of the underlying FILE* is set.
bool FileStream::Eof() {
  return feof(fp_) != 0;
}
|
||||
|
||||
void FileStream::Close() {
|
||||
if (fp_ != nullptr) {
|
||||
fclose(fp_);
|
||||
fp_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace recordio
|
||||
} // namespace paddle
|
@ -1,56 +0,0 @@
|
||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string>
|
||||
|
||||
#include "paddle/fluid/platform/enforce.h"
|
||||
#include "paddle/fluid/platform/macros.h" // DISABLE_COPY_ASSIGN
|
||||
|
||||
namespace paddle {
|
||||
namespace recordio {
|
||||
|
||||
// Seekable Stream Interface for read and write
|
||||
class Stream {
 public:
  virtual ~Stream() = default;

  // Reads up to `size` bytes into `ptr`; returns a byte count.
  virtual size_t Read(void* ptr, size_t size) = 0;

  // Writes `size` bytes taken from `ptr`.
  virtual void Write(const void* ptr, size_t size) = 0;

  // Returns the current position in the stream.
  virtual size_t Tell() = 0;

  // Moves the current position to `p`.
  virtual void Seek(size_t p) = 0;

  // Factory: creates a Stream instance for `filename`, opened with `mode`.
  static Stream* Open(const char* filename, const char* mode);
};
|
||||
|
||||
// FileStream
|
||||
class FileStream : public Stream {
|
||||
public:
|
||||
explicit FileStream(FILE* fp) : fp_(fp) {}
|
||||
~FileStream() { this->Close(); }
|
||||
size_t Read(void* ptr, size_t size);
|
||||
void Write(const void* ptr, size_t size);
|
||||
size_t Tell();
|
||||
void Seek(size_t p);
|
||||
bool Eof();
|
||||
void Close();
|
||||
|
||||
private:
|
||||
FILE* fp_;
|
||||
DISABLE_COPY_AND_ASSIGN(FileStream);
|
||||
};
|
||||
|
||||
} // namespace recordio
|
||||
} // namespace paddle
|
@ -1,36 +0,0 @@
|
||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "paddle/fluid/recordio/io.h"
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
using namespace paddle::recordio;
|
||||
|
||||
// Round-trips the 6 bytes of "hello" (including the terminating NUL) through
// a file under /tmp.
TEST(FileStream, IO) {
  // Write phase.
  {
    Stream* writer = Stream::Open("/tmp/record_0", "w");
    writer->Write("hello", 6);
    delete writer;
  }

  // Read phase.
  {
    Stream* reader = Stream::Open("/tmp/record_0", "r+");
    char buf[10];
    reader->Read(buf, 6);
    EXPECT_STREQ(buf, "hello");
    delete reader;
  }
}
|
@ -1,85 +0,0 @@
|
||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "paddle/fluid/recordio/range_scanner.h"
|
||||
|
||||
namespace paddle {
|
||||
namespace recordio {
|
||||
|
||||
// Populates this Index by repeatedly parsing a Header from `fi` until
// fi->Eof() is true. For every parsed header it records the running offset,
// the header's NumRecords() (stored in both chunk_lens_ and chunk_records_),
// adds NumRecords() to num_records_, and advances the running offset by the
// header's CompressSize().
//
// NOTE(review): range_scanner.h declares LoadIndex(Stream*), while this
// definition takes FileStream* (it needs Eof(), which Stream does not
// declare) - the two signatures should be reconciled.
// NOTE(review): Eof() mirrors feof(), which only turns true after a read hit
// end-of-file, so the last iteration presumably parses a header from an
// exhausted stream - confirm how Header::Parse reports failure.
// NOTE(review): the stream is never seeked past the chunk payload here and
// `offset` does not include the header size - verify against Header::Parse.
void Index::LoadIndex(FileStream* fi) {
|
||||
int64_t offset = 0;
|
||||
while (!fi->Eof()) {
|
||||
Header hdr;
|
||||
hdr.Parse(fi);
|
||||
chunk_offsets_.push_back(offset);
|
||||
chunk_lens_.push_back(hdr.NumRecords());
|
||||
chunk_records_.push_back(hdr.NumRecords());
|
||||
num_records_ += hdr.NumRecords();
|
||||
offset += hdr.CompressSize();
|
||||
}
|
||||
}
|
||||
|
||||
// Returns a new Index that describes only the i-th chunk of this index: its
// offset, its length and its record count. `i` must be a valid chunk
// position (0 <= i < NumChunks()); it is not range-checked here.
Index Index::ChunkIndex(int i) {
  Index idx;
  idx.chunk_offsets_.push_back(chunk_offsets_[i]);
  idx.chunk_lens_.push_back(chunk_lens_[i]);
  idx.chunk_records_.push_back(chunk_records_[i]);
  idx.num_records_ = chunk_records_[i];
  return idx;
}
|
||||
|
||||
// Maps a file-wide record index to (chunk index, record index inside that
// chunk). Returns (-1, -1) when `record_idx` is negative or not covered by
// any chunk.
std::pair<int, int> Index::Locate(int record_idx) {
  const std::pair<int, int> not_found(-1, -1);
  if (record_idx < 0) {
    return not_found;
  }

  int records_before = 0;  // number of records in chunks [0, i)
  for (size_t i = 0; i < chunk_lens_.size(); ++i) {
    const int len = static_cast<int>(chunk_lens_[i]);
    if (record_idx < records_before + len) {
      // Stop at the first chunk that contains the record; continuing would
      // let every later chunk overwrite the answer.
      return std::pair<int, int>(static_cast<int>(i),
                                 record_idx - records_before);
    }
    records_before += len;
  }
  return not_found;
}
|
||||
|
||||
// Creates a scanner over records [start, start + len) of the file described
// by `idx`. A negative `start` means "from the beginning"; a negative `len`,
// or a range reaching past the last record, is clamped to the end of file.
//
// The stream member is named `fi` in the class declaration, so that is the
// member initialized here.
RangeScanner::RangeScanner(Stream* fi, Index idx, int start, int len)
    : fi(fi), index_(idx) {
  const int total = index_.NumRecords();
  if (start < 0) {
    start = 0;
  }
  // Written as `len >= total - start` so the check cannot overflow.
  if (len < 0 || len >= total - start) {
    len = total - start;
  }

  start_ = start;
  end_ = start + len;
  cur_ = start - 1;  // The initial status required by Scan
  chunk_index_ = -1;
  chunk_.reset(new Chunk);
}
|
||||
|
||||
bool RangeScanner::Scan() {
|
||||
++cur_;
|
||||
if (cur_ >= end_) {
|
||||
return false;
|
||||
} else {
|
||||
auto cursor = index_.Locate(cur_);
|
||||
if (chunk_index_ != cursor.first) {
|
||||
chunk_index_ = cursor.first;
|
||||
chunk_->Parse(fi, index_.ChunkOffsets[chunk_index_]);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
const std::string RangeScanner::Record() {
|
||||
auto cursor = index_.Locate(cur_);
|
||||
return chunk_->Record(cursor.second);
|
||||
}
|
||||
|
||||
} // namespace recordio
|
||||
} // namespace paddle
|
@ -1,81 +0,0 @@
|
||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <utility>
|
||||
|
||||
#include "paddle/fluid/recordio/chunk.h"
|
||||
#include "paddle/fluid/recordio/io.h"
|
||||
|
||||
namespace paddle {
|
||||
namespace recordio {
|
||||
|
||||
// Index consists of the offsets and sizes of the consecutive chunks in a RecordIO
|
||||
// file.
|
||||
//
|
||||
// Index supports Gob. Every field in the Index needs to be exported
|
||||
// for the correct encoding and decoding using Gob.
|
||||
class Index {
|
||||
public:
|
||||
Index() : num_records_(0) {}
|
||||
// LoadIndex scans the file and parse chunkOffsets, chunkLens, and len.
|
||||
void LoadIndex(Stream* fi);
|
||||
// NumRecords returns the total number of all records in a RecordIO file.
|
||||
int NumRecords() { return num_records_; }
|
||||
// NumChunks returns the total number of chunks in a RecordIO file.
|
||||
int NumChunks() { return chunk_lens_.size(); }
|
||||
// ChunkIndex return the Index of i-th Chunk.
|
||||
int ChunkIndex(int i);
|
||||
|
||||
int64_t ChunkOffsets(int i) { return chunk_offsets_[i]; }
|
||||
|
||||
// Locate returns the index of chunk that contains the given record,
|
||||
// and the record index within the chunk. It returns (-1, -1) if the
|
||||
// record is out of range.
|
||||
std::pair<int, int> Locate(int record_idx);
|
||||
|
||||
private:
|
||||
// the offset of each chunk in a file.
|
||||
std::vector<int64_t> chunk_offsets_;
|
||||
// the length of each chunk in a file.
|
||||
std::vector<uint32_t> chunk_lens_;
|
||||
// the numer of all records in a file.
|
||||
int num_records_;
|
||||
// the number of records in chunks.
|
||||
std::vector<int> chunk_records_;
|
||||
};
|
||||
|
||||
// RangeScanner
|
||||
class RangeScanner {
|
||||
public:
|
||||
// creates a scanner that sequencially reads records in the
|
||||
// range [start, start+len). If start < 0, it scans from the
|
||||
// beginning. If len < 0, it scans till the end of file.
|
||||
RangeScanner(Stream* fi, Index idx, int start, int end);
|
||||
// Scan moves the cursor forward for one record and loads the chunk
|
||||
// containing the record if not yet.
|
||||
bool Scan();
|
||||
const std::string Record();
|
||||
|
||||
private:
|
||||
Stream* fi;
|
||||
Index index_;
|
||||
int start_, end_, cur_;
|
||||
int chunk_index_;
|
||||
std::unique_ptr<Chunk> chunk_;
|
||||
};
|
||||
|
||||
} // namespace recordio
|
||||
} // namespace paddle
|
@ -1,23 +0,0 @@
|
||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "paddle/fluid/recordio/range_scanner.h"
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
using namespace paddle::recordio;
|
||||
|
||||
// Placeholder test: only opens a stream for writing. The stream returned by
// Stream::Open is heap-allocated, so it is released here to avoid a leak.
TEST(RangeScanner, Recordio) {
  Stream* fo = Stream::Open("/tmp/record_range", "w");
  delete fo;
}
|
@ -1,20 +0,0 @@
|
||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "paddle/fluid/recordio/io.h"
|
||||
#include "paddle/fluid/string/piece.h"
|
||||
|
||||
namespace paddle {
|
||||
namespace recordio {} // namespace recordio
|
||||
} // namespace paddle
|
@ -1,20 +0,0 @@
|
||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
#pragma once
|
||||
|
||||
#include "paddle/fluid/recordio/chunk.h"
|
||||
#include "paddle/fluid/recordio/header.h"
|
||||
#include "paddle/fluid/recordio/io.h"
|
||||
#include "paddle/fluid/recordio/scanner.h"
|
||||
#include "paddle/fluid/recordio/writer.h"
|
@ -1,68 +0,0 @@
|
||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "paddle/fluid/recordio/chunk.h"
|
||||
|
||||
#include <glob.h> // glob
|
||||
|
||||
namespace paddle {
|
||||
namespace recordio {
|
||||
|
||||
// Expands the glob pattern `paths` (with tilde expansion) and remembers every
// matching path. No file is opened here; that happens lazily in NextFile().
//
// All members are initialized: Scan() tests cur_scanner_ against nullptr and
// Err() returns err_, so neither may be left indeterminate.
Scanner::Scanner(const char* paths)
    : cur_file_(nullptr),
      cur_scanner_(nullptr),
      path_idx_(0),
      end_(false),
      err_(0) {
  if (paths == nullptr) {
    return;
  }

  glob_t glob_result = glob_t();  // zeroed so globfree is always safe
  // Only trust gl_pathc/gl_pathv when glob reports success; a failed or
  // non-matching glob simply leaves paths_ empty.
  if (glob(paths, GLOB_TILDE, NULL, &glob_result) == 0) {
    for (size_t i = 0; i < glob_result.gl_pathc; ++i) {
      paths_.emplace_back(std::string(glob_result.gl_pathv[i]));
    }
  }
  globfree(&glob_result);
}
|
||||
|
||||
bool Scanner::Scan() {
|
||||
if (end_ == true) {
|
||||
return false;
|
||||
}
|
||||
if (cur_scanner_ == nullptr) {
|
||||
if (!NextFile()) {
|
||||
end_ = true;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (!cur_scanner_->Scan()) {
|
||||
end_ = true;
|
||||
cur_file_ = nullptr;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Scanner::NextFile() {
|
||||
if (path_idx_ >= paths_.size()) {
|
||||
return false;
|
||||
}
|
||||
std::string path = paths_[path_idx_];
|
||||
++path_idx_;
|
||||
cur_file_ = Stream::Open(path);
|
||||
if (cur_file_ == nullptr) {
|
||||
return false;
|
||||
}
|
||||
Index idx;
|
||||
idx.LoadIndex(cur_file_);
|
||||
cur_scanner_ = RangeScanner(cur_file_, idx, 0, -1);
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace recordio
|
||||
} // namespace paddle
|
@ -1,44 +0,0 @@
|
||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "paddle/fluid/recordio/io.h"
|
||||
|
||||
namespace paddle {
|
||||
namespace recordio {
|
||||
|
||||
class RangeScanner;
|
||||
|
||||
// Scanner is a scanner for multiple recordio files.
|
||||
class Scanner {
|
||||
public:
|
||||
// `paths` is a glob pattern; matching paths are collected at construction.
Scanner(const char* paths);
|
||||
// NOTE(review): presumably returns the current record; no definition is
// visible alongside the other Scanner methods - confirm.
const std::string Record();
|
||||
// Moves to the next record; returns false when scanning has ended.
bool Scan();
|
||||
// NOTE(review): no definition is visible alongside the other methods.
void Close();
|
||||
// Opens the next path in paths_; returns false if there is none or the
// file cannot be opened.
bool NextFile();
|
||||
// Returns the stored error code.
int Err() { return err_; }
|
||||
|
||||
private:
|
||||
// Paths produced by expanding the constructor's glob pattern.
std::vector<std::string> paths_;
|
||||
// Stream of the file currently being scanned (nullptr if none).
Stream* cur_file_;
|
||||
// Scanner over the records of the current file.
RangeScanner* cur_scanner_;
|
||||
// Position in paths_ of the next file to open.
int path_idx_;
|
||||
// Set once scanning has finished.
bool end_;
|
||||
// Error code returned by Err().
int err_;
|
||||
};
|
||||
|
||||
} // namespace recordio
|
||||
} // namespace paddle
|
@ -1,21 +0,0 @@
|
||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "paddle/fluid/recordio/scanner.h"
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
using namespace paddle::recordio;
|
||||
|
||||
// Smoke test: constructs a Scanner over a glob pattern.
TEST(Scanner, Normal) {
  Scanner s("/tmp/record_*");
}
|
@ -1,53 +0,0 @@
|
||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "paddle/fluid/recordio/writer.h"
|
||||
|
||||
namespace paddle {
|
||||
namespace recordio {
|
||||
|
||||
// Creates a writer on `fo` with a maximum chunk size of 0 and compressor
// value 0.
//
// The chunk is allocated here because Write() and Close() dereference it,
// and the compressor is cast explicitly, exactly as the three-argument
// constructor does.
// NOTE(review): with max_chunk_size_ == 0 every Write() dumps the chunk
// before adding the record - confirm this default is intended.
Writer::Writer(Stream* fo)
    : stream_(fo),
      chunk_(new Chunk),
      max_chunk_size_(0),
      compressor_(static_cast<Compressor>(0)) {}
|
||||
|
||||
// Creates a writer on `fo` that dumps its chunk once adding a record would
// exceed `maxChunkSize` bytes, using `compressor` (converted to Compressor)
// when dumping.
Writer::Writer(Stream* fo, int maxChunkSize, int compressor)
    : stream_(fo),
      chunk_(new Chunk),
      max_chunk_size_(maxChunkSize),
      compressor_(static_cast<Compressor>(compressor)) {}
|
||||
|
||||
size_t Writer::Write(const char* buf, size_t length) {
|
||||
if (stream_ == nullptr) {
|
||||
LOG(WARNING) << "Cannot write since writer had been closed.";
|
||||
return 0;
|
||||
}
|
||||
if ((length + chunk_->NumBytes()) > max_chunk_size_) {
|
||||
chunk_->Dump(stream_, compressor_);
|
||||
}
|
||||
chunk_->Add(buf, length);
|
||||
return length;
|
||||
}
|
||||
|
||||
// size_t Writer::Write(const char* buf, size_t length) {
|
||||
// return Write(std::string(buf, length));
|
||||
// }
|
||||
|
||||
// size_t Writer::Write(std::string&& buf) {}
|
||||
|
||||
// Flushes the current chunk to the stream and marks the writer as closed by
// resetting stream_ to nullptr. Calling Close() again is a no-op, and a
// writer without a chunk is closed without dumping.
void Writer::Close() {
  if (stream_ == nullptr) {
    return;  // already closed
  }
  if (chunk_) {
    chunk_->Dump(stream_, compressor_);
  }
  stream_ = nullptr;
}
|
||||
|
||||
} // namespace recordio
|
||||
} // namespace paddle
|
@ -1,50 +0,0 @@
|
||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "paddle/fluid/recordio/header.h"
|
||||
#include "paddle/fluid/recordio/io.h"
|
||||
|
||||
namespace paddle {
|
||||
namespace recordio {
|
||||
|
||||
// Writer creates a RecordIO file.
|
||||
class Writer {
|
||||
public:
|
||||
// Creates a writer on `fo` with max chunk size 0 and compressor 0.
Writer(Stream* fo);
|
||||
// Creates a writer on `fo` with the given max chunk size; `c` is converted
// to a Compressor value.
Writer(Stream* fo, int maxChunkSize, int c);
|
||||
|
||||
// Writes a record and returns `length`. Returns 0 (after logging a warning)
// if Close has been called.
|
||||
size_t Write(const char* buf, size_t length);
|
||||
|
||||
// Close flushes the current chunk and makes the writer invalid.
|
||||
void Close();
|
||||
|
||||
private:
|
||||
// Set nullptr to mark a closed writer
|
||||
Stream* stream_;
|
||||
// Chunk that buffers records until they are dumped to the stream
|
||||
std::unique_ptr<Chunk> chunk_;
|
||||
// total records size, excluding metadata, before compression.
|
||||
int max_chunk_size_;
|
||||
// Compressor used for chunk
|
||||
Compressor compressor_;
|
||||
DISABLE_COPY_AND_ASSIGN(Writer);
|
||||
};
|
||||
|
||||
} // namespace recordio
|
||||
} // namespace paddle
|
@ -1,29 +0,0 @@
|
||||
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "paddle/fluid/recordio/writer.h"
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
using namespace paddle::recordio;
|
||||
|
||||
// Writes one record, closes the writer, then checks that a write on the
// closed writer is rejected. Writer::Write signals a closed writer by
// returning 0 (it logs a warning and does not throw), so that return value
// is what is asserted.
TEST(Writer, Normal) {
  Stream* fs = Stream::Open("/tmp/record_21", "w");
  Writer w(fs);
  w.Write("123", 4);

  // writing after Close must be rejected
  w.Close();
  EXPECT_EQ(0u, w.Write("123", 4));

  // The stream was heap-allocated by Stream::Open; release it.
  delete fs;
}
|
Loading…
Reference in new issue