|
|
|
@ -141,40 +141,42 @@ class MultiFileReader : public Reader {
|
|
|
|
|
|
|
|
|
|
void MonitorThread(std::vector<ReaderThreadStatus>* thread_status,
|
|
|
|
|
std::shared_ptr<LoDTensorBlockingQueue> queue) {
|
|
|
|
|
VLOG(30) << "monitor thread in";
|
|
|
|
|
VLOG(3) << "monitor thread in";
|
|
|
|
|
bool reader_thread_is_running = true;
|
|
|
|
|
while (reader_thread_is_running) {
|
|
|
|
|
VLOG(30) << "reader_thread_is_running";
|
|
|
|
|
VLOG(3) << "reader_thread_is_running";
|
|
|
|
|
reader_thread_is_running = false;
|
|
|
|
|
for (size_t i = 0; i < (*thread_status).size(); ++i) {
|
|
|
|
|
if ((*thread_status)[i] == Running) {
|
|
|
|
|
VLOG(30) << "reader is running!";
|
|
|
|
|
VLOG(3) << "reader is running!";
|
|
|
|
|
reader_thread_is_running = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
|
|
|
|
|
}
|
|
|
|
|
VLOG(30) << "all reader thread is stopped, push empty data into queue";
|
|
|
|
|
VLOG(3) << "all reader thread is stopped, push empty data into queue";
|
|
|
|
|
queue->Push({});
|
|
|
|
|
VLOG(30) << "monitor thread exited";
|
|
|
|
|
VLOG(3) << "monitor thread exited";
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void ReadThread(const std::vector<std::string>& file_list,
|
|
|
|
|
const std::vector<std::string>& slots, int batch_size,
|
|
|
|
|
const std::string& file_type, const std::string& file_format,
|
|
|
|
|
const std::vector<std::string>& dense_slots,
|
|
|
|
|
const std::vector<std::string>& sparse_slots, int batch_size,
|
|
|
|
|
int thread_id, std::vector<ReaderThreadStatus>* thread_status,
|
|
|
|
|
std::shared_ptr<LoDTensorBlockingQueue> queue) {
|
|
|
|
|
VLOG(30) << "[" << thread_id << "]"
|
|
|
|
|
<< " reader thread start! thread_id = " << thread_id;
|
|
|
|
|
VLOG(3) << "[" << thread_id << "]"
|
|
|
|
|
<< " reader thread start! thread_id = " << thread_id;
|
|
|
|
|
for (auto& file : file_list) {
|
|
|
|
|
VLOG(30) << "[" << thread_id << "]"
|
|
|
|
|
<< " file " << file;
|
|
|
|
|
VLOG(3) << "[" << thread_id << "]"
|
|
|
|
|
<< " file " << file;
|
|
|
|
|
}
|
|
|
|
|
(*thread_status)[thread_id] = Running;
|
|
|
|
|
VLOG(30) << "set status to running";
|
|
|
|
|
VLOG(3) << "set status to running";
|
|
|
|
|
|
|
|
|
|
std::unordered_map<std::string, size_t> slot_to_index;
|
|
|
|
|
for (size_t i = 0; i < slots.size(); ++i) {
|
|
|
|
|
slot_to_index[slots[i]] = i;
|
|
|
|
|
for (size_t i = 0; i < sparse_slots.size(); ++i) {
|
|
|
|
|
slot_to_index[sparse_slots[i]] = i;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::string line;
|
|
|
|
@ -182,11 +184,18 @@ void ReadThread(const std::vector<std::string>& file_list,
|
|
|
|
|
std::vector<std::unordered_map<std::string, std::vector<int64_t>>> batch_data;
|
|
|
|
|
std::vector<int64_t> batch_label;
|
|
|
|
|
|
|
|
|
|
MultiFileReader<GzipReader> reader(file_list);
|
|
|
|
|
std::unique_ptr<Reader> reader;
|
|
|
|
|
if (file_type == "gzip") {
|
|
|
|
|
reader.reset(new MultiFileReader<GzipReader>(file_list));
|
|
|
|
|
} else if (file_type == "plain") {
|
|
|
|
|
reader.reset(new MultiFileReader<PlainFileReader>(file_list));
|
|
|
|
|
} else {
|
|
|
|
|
PADDLE_THROW("do not support file format %s", file_type);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
VLOG(30) << "reader inited";
|
|
|
|
|
VLOG(3) << "reader inited";
|
|
|
|
|
|
|
|
|
|
while (reader.HasNext()) {
|
|
|
|
|
while (reader->HasNext()) {
|
|
|
|
|
batch_data.clear();
|
|
|
|
|
batch_data.reserve(batch_size);
|
|
|
|
|
|
|
|
|
@ -195,8 +204,8 @@ void ReadThread(const std::vector<std::string>& file_list,
|
|
|
|
|
|
|
|
|
|
// read batch_size data
|
|
|
|
|
for (int i = 0; i < batch_size; ++i) {
|
|
|
|
|
if (reader.HasNext()) {
|
|
|
|
|
reader.NextLine(&line);
|
|
|
|
|
if (reader->HasNext()) {
|
|
|
|
|
reader->NextLine(&line);
|
|
|
|
|
std::unordered_map<std::string, std::vector<int64_t>> slot_to_data;
|
|
|
|
|
int64_t label;
|
|
|
|
|
parse_line(line, slot_to_index, &label, &slot_to_data);
|
|
|
|
@ -209,8 +218,8 @@ void ReadThread(const std::vector<std::string>& file_list,
|
|
|
|
|
|
|
|
|
|
std::vector<framework::LoDTensor> lod_datas;
|
|
|
|
|
|
|
|
|
|
// first insert tensor for each slots
|
|
|
|
|
for (auto& slot : slots) {
|
|
|
|
|
// first insert tensor for each sparse_slots
|
|
|
|
|
for (auto& slot : sparse_slots) {
|
|
|
|
|
std::vector<size_t> lod_data{0};
|
|
|
|
|
std::vector<int64_t> batch_feasign;
|
|
|
|
|
|
|
|
|
@ -242,11 +251,11 @@ void ReadThread(const std::vector<std::string>& file_list,
|
|
|
|
|
lod_datas.push_back(label_tensor);
|
|
|
|
|
|
|
|
|
|
queue->Push(lod_datas);
|
|
|
|
|
VLOG(40) << "push one data, queue_size=" << queue->Size();
|
|
|
|
|
VLOG(4) << "push one data, queue_size=" << queue->Size();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
(*thread_status)[thread_id] = Stopped;
|
|
|
|
|
VLOG(30) << "set status to stopped, thread " << thread_id << " exited";
|
|
|
|
|
VLOG(3) << "set status to stopped, thread " << thread_id << " exited";
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} // namespace reader
|
|
|
|
|