|
|
@ -11,7 +11,7 @@
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
# limitations under the License.
|
|
|
|
"""HDFS Utils"""
|
|
|
|
"""HDFS Utils."""
|
|
|
|
|
|
|
|
|
|
|
|
import os
|
|
|
|
import os
|
|
|
|
import sys
|
|
|
|
import sys
|
|
|
@ -84,7 +84,7 @@ class HDFSClient(object):
|
|
|
|
ret_code, ret_out, ret_err = proc.returncode, output, errors
|
|
|
|
ret_code, ret_out, ret_err = proc.returncode, output, errors
|
|
|
|
|
|
|
|
|
|
|
|
_logger.info(
|
|
|
|
_logger.info(
|
|
|
|
'Times: %d, Running command: %s. Return code: %d, Error: %s' %
|
|
|
|
'Times: %d, Running command: %s. Return code: %d, Msg: %s' %
|
|
|
|
(x, whole_commands, proc.returncode, errors))
|
|
|
|
(x, whole_commands, proc.returncode, errors))
|
|
|
|
|
|
|
|
|
|
|
|
if ret_code == 0:
|
|
|
|
if ret_code == 0:
|
|
|
@ -93,6 +93,13 @@ class HDFSClient(object):
|
|
|
|
return ret_code, ret_out, ret_err
|
|
|
|
return ret_code, ret_out, ret_err
|
|
|
|
|
|
|
|
|
|
|
|
def cat(self, hdfs_path=None):
|
|
|
|
def cat(self, hdfs_path=None):
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
cat hdfs file
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
|
|
|
hdfs_path(str): the hdfs file path
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
|
|
|
file content
|
|
|
|
|
|
|
|
"""
|
|
|
|
if self.is_file(hdfs_path):
|
|
|
|
if self.is_file(hdfs_path):
|
|
|
|
exist_cmd = ['-cat', hdfs_path]
|
|
|
|
exist_cmd = ['-cat', hdfs_path]
|
|
|
|
returncode, output, errors = self.__run_hdfs_cmd(
|
|
|
|
returncode, output, errors = self.__run_hdfs_cmd(
|
|
|
@ -101,8 +108,7 @@ class HDFSClient(object):
|
|
|
|
_logger.error("HDFS cat HDFS path: {} failed".format(hdfs_path))
|
|
|
|
_logger.error("HDFS cat HDFS path: {} failed".format(hdfs_path))
|
|
|
|
return ""
|
|
|
|
return ""
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
_logger.error("HDFS cat HDFS path: {} succeed".format(
|
|
|
|
_logger.info("HDFS cat HDFS path: {} succeed".format(hdfs_path))
|
|
|
|
hdfs_path))
|
|
|
|
|
|
|
|
return output.strip()
|
|
|
|
return output.strip()
|
|
|
|
|
|
|
|
|
|
|
|
else:
|
|
|
|
else:
|
|
|
@ -190,7 +196,7 @@ class HDFSClient(object):
|
|
|
|
whether the remote HDFS path exists
|
|
|
|
whether the remote HDFS path exists
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
Args:
|
|
|
|
hdfs_path: HDFS path.
|
|
|
|
hdfs_path(str): HDFS path.
|
|
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
Returns:
|
|
|
|
True or False
|
|
|
|
True or False
|
|
|
@ -224,9 +230,10 @@ class HDFSClient(object):
|
|
|
|
Move a file or folder on HDFS.
|
|
|
|
Move a file or folder on HDFS.
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
Args:
|
|
|
|
hdfs_path(str): HDFS path.
|
|
|
|
hdfs_src_path(str): HDFS path
|
|
|
|
overwrite(bool|False): If the path already exists and overwrite is False, will return False.
|
|
|
|
hdfs_dst_path(str): HDFS path
|
|
|
|
|
|
|
|
overwrite(bool|False): If the path already exists and overwrite is
|
|
|
|
|
|
|
|
False, will return False.
|
|
|
|
Returns:
|
|
|
|
Returns:
|
|
|
|
True or False
|
|
|
|
True or False
|
|
|
|
"""
|
|
|
|
"""
|
|
|
@ -256,8 +263,9 @@ class HDFSClient(object):
|
|
|
|
def make_local_dirs(local_path):
|
|
|
|
def make_local_dirs(local_path):
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
create a local directory, same as mkdir
|
|
|
|
create a local directory, same as mkdir
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
Args:
|
|
|
|
local_path: local path that wants to create a directiory.
|
|
|
|
local_path(str): local path of the directory to create.
|
|
|
|
"""
|
|
|
|
"""
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
os.makedirs(local_path)
|
|
|
|
os.makedirs(local_path)
|
|
|
@ -270,7 +278,8 @@ class HDFSClient(object):
|
|
|
|
Create a remote directory, recursively if necessary.
|
|
|
|
Create a remote directory, recursively if necessary.
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
Args:
|
|
|
|
hdfs_path(str): Remote path. Intermediate directories will be created appropriately.
|
|
|
|
hdfs_path(str): Remote path. Intermediate directories will be
|
|
|
|
|
|
|
|
created appropriately.
|
|
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
Returns:
|
|
|
|
True or False
|
|
|
|
True or False
|
|
|
@ -290,7 +299,7 @@ class HDFSClient(object):
|
|
|
|
_logger.error("HDFS mkdir path: {} failed".format(hdfs_path))
|
|
|
|
_logger.error("HDFS mkdir path: {} failed".format(hdfs_path))
|
|
|
|
return False
|
|
|
|
return False
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
_logger.error("HDFS mkdir path: {} successfully".format(hdfs_path))
|
|
|
|
_logger.info("HDFS mkdir path: {} successfully".format(hdfs_path))
|
|
|
|
return True
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
|
|
def ls(self, hdfs_path):
|
|
|
|
def ls(self, hdfs_path):
|
|
|
@ -298,7 +307,7 @@ class HDFSClient(object):
|
|
|
|
ls directory contents about HDFS hdfs_path
|
|
|
|
ls directory contents about HDFS hdfs_path
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
Args:
|
|
|
|
hdfs_path(str): Remote HDFS path will be ls.
|
|
|
|
hdfs_path(str): Remote HDFS path will be ls.
|
|
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
Returns:
|
|
|
|
List: a contents list about hdfs_path.
|
|
|
|
List: a contents list about hdfs_path.
|
|
|
@ -332,9 +341,8 @@ class HDFSClient(object):
|
|
|
|
list directory contents about HDFS hdfs_path recursively
|
|
|
|
list directory contents about HDFS hdfs_path recursively
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
Args:
|
|
|
|
hdfs_path(str): Remote HDFS path.
|
|
|
|
hdfs_path(str): Remote HDFS path.
|
|
|
|
only_file(bool|True): will discard folders.
|
|
|
|
excludes(list): excludes
|
|
|
|
sort(bool|True): will be sorted by create time.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
Returns:
|
|
|
|
List: a contents list about hdfs_path.
|
|
|
|
List: a contents list about hdfs_path.
|
|
|
@ -373,7 +381,18 @@ class HDFSClient(object):
|
|
|
|
return ret_lines
|
|
|
|
return ret_lines
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
@staticmethod
|
|
|
|
def split_flies(files, trainer_id, trainers):
|
|
|
|
def split_files(files, trainer_id, trainers):
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
split file list
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
|
|
|
files(list): file list
|
|
|
|
|
|
|
|
trainer_id(int): trainer mpi rank id
|
|
|
|
|
|
|
|
trainers(int): all trainers num
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
|
|
|
filelist(list): file list of current trainer
|
|
|
|
|
|
|
|
"""
|
|
|
|
remainder = len(files) % trainers
|
|
|
|
remainder = len(files) % trainers
|
|
|
|
blocksize = len(files) / trainers
|
|
|
|
blocksize = len(files) / trainers
|
|
|
|
|
|
|
|
|
|
|
@ -402,6 +421,8 @@ class HDFSClient(object):
|
|
|
|
hdfs_path(str): path on hdfs
|
|
|
|
hdfs_path(str): path on hdfs
|
|
|
|
local_path(str): path on local
|
|
|
|
local_path(str): path on local
|
|
|
|
multi_processes(int|5): the download data process at the same time, default=5
|
|
|
|
multi_processes(int|5): the download data process at the same time, default=5
|
|
|
|
|
|
|
|
overwrite(bool): is overwrite
|
|
|
|
|
|
|
|
retry_times(int): retry times
|
|
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
Returns:
|
|
|
|
List:
|
|
|
|
List:
|
|
|
@ -478,7 +499,7 @@ class HDFSClient(object):
|
|
|
|
local_path(str): path on local
|
|
|
|
local_path(str): path on local
|
|
|
|
multi_processes(int|5): the upload data process at the same time, default=5
|
|
|
|
multi_processes(int|5): the upload data process at the same time, default=5
|
|
|
|
overwrite(bool|False): will overwrite file on HDFS or not
|
|
|
|
overwrite(bool|False): will overwrite file on HDFS or not
|
|
|
|
sync(bool|True): upload files sync or not.
|
|
|
|
retry_times(int): upload file max retry time.
|
|
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
Returns:
|
|
|
|
None
|
|
|
|
None
|
|
|
@ -497,6 +518,15 @@ class HDFSClient(object):
|
|
|
|
return True
|
|
|
|
return True
|
|
|
|
|
|
|
|
|
|
|
|
def get_local_files(path):
|
|
|
|
def get_local_files(path):
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
get local files
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
|
|
|
path(str): local path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
|
|
|
list of local files
|
|
|
|
|
|
|
|
"""
|
|
|
|
rlist = []
|
|
|
|
rlist = []
|
|
|
|
|
|
|
|
|
|
|
|
if not os.path.exists(path):
|
|
|
|
if not os.path.exists(path):
|
|
|
@ -537,6 +567,32 @@ class HDFSClient(object):
|
|
|
|
_logger.info("Finish upload datas from {} to {}".format(local_path,
|
|
|
|
_logger.info("Finish upload datas from {} to {}".format(local_path,
|
|
|
|
hdfs_path))
|
|
|
|
hdfs_path))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def upload_dir(self, dest_dir, local_dir, overwrite=False, retry_times=5):
    """
    Upload a local directory to HDFS with ``-put``.

    The directory is placed *inside* ``dest_dir``, i.e. it ends up at
    ``dest_dir/<basename of local_dir>``.

    Args:
        dest_dir(str): HDFS directory that will contain the uploaded dir;
            it is created with ``self.makedirs`` when it does not exist.
        local_dir(str): local directory to upload.
        overwrite(bool): when True, an already existing
            ``dest_dir/<basename>`` is deleted before uploading.
        retry_times(int): retry count forwarded to the HDFS command
            runner. NOTE(review): the default of 5 is an assumption;
            confirm it matches the other upload/download helpers.

    Returns:
        bool: True if the put command returned 0, False otherwise.
    """
    # Trailing slashes would make basename() return "" and produce "//"
    # when the target path is assembled below.
    local_dir = local_dir.rstrip("/")
    dest_dir = dest_dir.rstrip("/")
    target_dir = dest_dir + "/" + os.path.basename(local_dir)

    # Check the cheap flag first so no remote existence probe is issued
    # when overwriting was not requested.
    if overwrite and self.is_exist(target_dir):
        self.delete(target_dir)
    if not self.is_exist(dest_dir):
        self.makedirs(dest_dir)

    put_command = ["-put", local_dir, dest_dir]
    # retry_times used to be an undefined name here (NameError on every
    # call); it is now a keyword parameter of this method.
    returncode, output, errors = self.__run_hdfs_cmd(put_command,
                                                     retry_times)
    if returncode != 0:
        _logger.error("Put local dir: {} to HDFS dir: {} failed".format(
            local_dir, dest_dir))
        return False
    return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
if __name__ == "__main__":
|
|
|
|
hadoop_home = "/home/client/hadoop-client/hadoop/"
|
|
|
|
hadoop_home = "/home/client/hadoop-client/hadoop/"
|
|
|
|