@ -24,6 +24,7 @@ import math
from paddle . dataset . common import download
import tarfile
import StringIO
import argparse
# Fix the seeds of `random` and `np.random` to 0 so that results which depend
# on these generators are repeatable from one run to the next.
random . seed ( 0 )
np . random . seed ( 0 )
@ -131,7 +132,7 @@ def check_integrity(filename, target_hash):
return False
def convert ( tar_file , output_file ) :
def convert _Imagenet_tar2bin ( tar_file , output_file ) :
print ( ' Converting 50000 images to binary file ... \n ' )
tar = tarfile . open ( name = tar_file , mode = ' r:gz ' )
@ -205,9 +206,90 @@ def run_convert():
" Can not convert the dataset to binary file with try limit {0} " .
format ( try_limit ) )
download_concat ( cache_folder , zip_path )
convert ( zip_path , output_file )
convert _Imagenet_tar2bin ( zip_path , output_file )
print ( " \n Success! The binary file can be found at {0} " . format ( output_file ) )
def convert_Imagenet_local2bin(args):
    """Convert a local image folder plus its label list into one binary file.

    The output file is laid out as: one int64 holding the number of list
    entries, then the float32 data of every image (slot ``idx`` for list entry
    ``idx``), then one int64 label per list entry.

    Args:
        args: Parsed command-line namespace. Reads ``data_dir`` (dataset root),
            ``label_list`` (list file, relative to ``data_dir``, holding one
            ``<image path> <label>`` pair per line), ``output_file`` (output
            name, relative to ``data_dir``) and ``data_dim`` (edge length used
            only for the final size check).

    Raises:
        ValueError: If ``args.data_dir`` is empty.
    """
    data_dir = args.data_dir
    # Validate before any path is derived from data_dir. An explicit raise is
    # used instead of ``assert`` so the check survives ``python -O``.
    if not data_dir:
        raise ValueError(
            'Once set --local, user need to provide the --data_dir')

    label_list_path = os.path.join(data_dir, args.label_list)
    bin_file_path = os.path.join(data_dir, args.output_file)

    with open(label_list_path) as flist:
        # Blank lines (e.g. a trailing newline at the end of the list) carry no
        # entry; keeping them would break the "path label" unpacking below and
        # inflate the image count written to the header.
        lines = [line.strip() for line in flist if line.strip()]
    num_images = len(lines)

    # Byte size of one image slot and the start offsets of the two sections.
    image_bytes = SIZE_FLOAT32 * DATA_DIM * DATA_DIM * 3
    images_offset = SIZE_INT64
    labels_offset = images_offset + image_bytes * num_images

    with open(bin_file_path, "w+b") as of:
        # Header: total number of list entries as int64.
        of.write(np.array(int(num_images)).astype('int64').tobytes())

        for idx, line in enumerate(lines):
            img_path, label = line.split()
            img_path = os.path.join(data_dir, img_path)
            if not os.path.exists(img_path):
                # Best-effort: the slot for this entry is left unwritten, but
                # say so instead of skipping silently.
                print("Warning: image not found, skipped: {0}".format(
                    img_path))
                continue

            # Save image (float32) into its slot; the context manager closes
            # the underlying image file once the pixel data has been copied.
            with Image.open(img_path) as img:
                np_img = np.array(process_image(img))
            of.seek(images_offset + image_bytes * idx)
            of.write(np_img.astype('float32').tobytes())

            # Save label (int64) into the matching slot of the label section.
            of.seek(labels_offset + idx * SIZE_INT64)
            of.write(np.array(int(label)).astype('int64').tobytes())

    # The bin file should contain
    # number of images + all images data + all corresponding labels
    # so the file target_size should be as follows.
    # NOTE(review): the offsets above use DATA_DIM while this check uses
    # args.data_dim; they only agree when --data_dim is left at its default.
    # Confirm which one process_image() actually honours.
    target_size = (SIZE_INT64 +
                   num_images * 3 * args.data_dim * args.data_dim *
                   SIZE_FLOAT32 + num_images * SIZE_INT64)
    if os.path.getsize(bin_file_path) == target_size:
        print(
            "Success! The user data output binary file can be found at: {0}".
            format(bin_file_path))
    else:
        print("Conversion failed!")
def main_preprocess_Imagenet(args):
    """Parse the command-line options and run the requested conversion.

    With ``--local`` the images under ``--data_dir`` are converted by
    ``convert_Imagenet_local2bin``; otherwise ``run_convert`` is called.

    Args:
        args: Argument vector in ``sys.argv`` form, i.e. with the program name
            as its first element. Any value that is not a list or tuple makes
            argparse fall back to reading ``sys.argv`` itself.
    """
    parser = argparse.ArgumentParser(
        description="Convert the full Imagenet val set or local data to binary file.",
        usage=None,
        add_help=True)
    parser.add_argument(
        '--local',
        action="store_true",
        help="If used, user need to set --data_dir and then convert file")
    parser.add_argument(
        "--data_dir", default="", type=str, help="Dataset root directory")
    parser.add_argument(
        "--label_list",
        type=str,
        default="val_list.txt",
        help="List of object labels with same sequence as denoted in the annotation file"
    )
    parser.add_argument(
        "--output_file",
        type=str,
        default="imagenet_small.bin",
        help="File path of the output binary file")
    parser.add_argument(
        "--data_dim",
        type=int,
        default=DATA_DIM,
        help="Image preprocess with data_dim width and height")

    # Honour the argument vector handed in by the caller (dropping the program
    # name) rather than silently re-reading sys.argv; passing None lets
    # argparse use sys.argv[1:] as before.
    argv = args[1:] if isinstance(args, (list, tuple)) else None
    options = parser.parse_args(argv)

    if options.local:
        convert_Imagenet_local2bin(options)
    else:
        run_convert()
if __name__ == '__main__':
    # main_preprocess_Imagenet() already calls run_convert() when --local is
    # not given, so invoking run_convert() here as well would run the
    # conversion twice.
    main_preprocess_Imagenet(sys.argv)