You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
160 lines
5.4 KiB
160 lines
5.4 KiB
#!/bin/env python
|
|
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
"""
|
|
Example:
|
|
python paraconvert.py --b2t -i INPUT -o OUTPUT -d DIM
|
|
python paraconvert.py --t2b -i INPUT -o OUTPUT
|
|
|
|
Options:
|
|
-h, --help show this help message and exit
|
|
--b2t convert parameter file of embedding model from binary to text
|
|
--t2b convert parameter file of embedding model from text to binary
|
|
-i INPUT input parameter file name
|
|
-o OUTPUT output parameter file name
|
|
-d DIM dimension of parameter
|
|
"""
|
|
from optparse import OptionParser
|
|
import struct
|
|
|
|
|
|
def binary2text(input, output, paraDim):
    """
    Convert a binary parameter file of embedding model to be a text file.

    input: the name of input binary parameter file, the format is:
        1) the first 16 bytes is filehead:
            version(4 bytes): version of paddle, default = 0
            floatSize(4 bytes): sizeof(float) = 4
            paraCount(8 bytes): total number of parameter
        2) the next (paraCount * 4) bytes is parameters, each has 4 bytes
    output: the name of output text parameter file, for example:
        0,4,32156096
        -0.7845433,1.1937413,-0.1704215,...
        0.0000909,0.0009465,-0.0008813,...
        ...
        the format is:
        1) the first line is filehead:
            version=0, floatSize=4, paraCount=32156096
        2) other lines print the parameters
            a) each line prints paraDim parameters separated by ','
            b) there are paraCount/paraDim lines (embedding words)
    paraDim: dimension of parameters (an int, or a string holding an int)

    Raises:
        ValueError: paraDim is not a positive integer.
        struct.error: the header is shorter than 16 bytes, or the parameter
            data does not end on a whole row of paraDim floats.
    """
    dim = int(paraDim)
    if dim <= 0:
        # A zero dimension would silently emit a header-only file.
        raise ValueError("paraDim must be a positive integer, got %r" %
                         (paraDim, ))

    # "=" selects native byte order with standard sizes and no padding, so
    # the header is always 4 + 4 + 8 = 16 bytes.  The native "iil" layout is
    # only 16 bytes where C long is 8 bytes wide.
    head = struct.Struct("=iiq")
    row = struct.Struct("=%df" % dim)

    line = 0
    with open(input, "rb") as fi, open(output, "w") as fo:
        version, floatSize, paraCount = head.unpack(fi.read(head.size))
        fo.write("%d,%d,%d\n" % (version, floatSize, paraCount))

        # Stream one embedding row at a time so large files are never held
        # in memory as a whole.
        context = fi.read(row.size)
        while context:
            numbers = row.unpack(context)
            fo.write(",".join(["%8.7f" % value for value in numbers]))
            fo.write("\n")
            line += 1
            context = fi.read(row.size)

    print("binary2text finish, total %d lines" % line)
|
|
|
|
|
|
def get_para_count(input):
    """
    Compute the total number of embedding parameters in input text file.

    The count is (number of non-blank lines) * (number of comma-separated
    fields on the first non-blank line); every row is assumed to have the
    same dimension as that first row.

    input: the name of input text file

    Returns the parameter count as an int; 0 for an empty file.
    """
    numRows = 0
    paraDim = 0
    with open(input) as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at end of file) hold no
            # parameters and must not be counted as rows.
            if not line.strip():
                continue
            if numRows == 0:
                paraDim = len(line.split(","))
            numRows += 1
    return numRows * paraDim
|
|
|
|
|
|
def text2binary(input, output, paddle_head=True):
    """
    Convert a text parameter file of embedding model to be a binary file.

    input: the name of input text parameter file, for example:
        -0.7845433,1.1937413,-0.1704215,...
        0.0000909,0.0009465,-0.0008813,...
        ...
        the format is:
        1) it doesn't have filehead
        2) each line stores the same dimension of parameters,
           the separator is commas ','
        Blank lines are skipped.
    output: the name of output binary parameter file, the format is:
        1) the first 16 bytes is filehead:
            version(4 bytes), floatSize(4 bytes), paraCount(8 bytes)
        2) the next (paraCount * 4) bytes is parameters, each has 4 bytes
    paddle_head: when True (the default) the 16-byte filehead is written;
        when False only the raw parameters are written.

    Raises:
        ValueError: a field cannot be parsed as a float.
    """
    # "=" selects native byte order with standard sizes and no padding, so
    # the header is always 4 + 4 + 8 = 16 bytes.  The native "iil" layout is
    # only 16 bytes where C long is 8 bytes wide.
    head = struct.Struct("=iiq")

    count = 0
    paraCount = 0
    with open(input, "r") as fi, open(output, "wb") as fo:
        if paddle_head:
            # Reserve room for the header; paraCount is only known after
            # the single pass over the input, so it is patched in below.
            fo.write(b"\0" * head.size)

        for line in fi:
            line = line.strip()
            if not line:
                continue
            values = [float(field) for field in line.split(",")]
            fo.write(struct.pack("=%df" % len(values), *values))
            paraCount += len(values)
            count += 1

        if paddle_head:
            fo.seek(0)
            fo.write(head.pack(0, 4, paraCount))

    print("text2binary finish, total %d lines" % count)
|
|
|
|
|
|
def main(argv=None):
    """
    Main entry for running paraconvert.py

    argv: list of command-line arguments without the program name; when
        None (the default) the arguments are taken from sys.argv[1:].

    Exits with status 2 and a usage message when the arguments are
    incomplete or contradictory.
    """
    usage = ("usage: \n"
             "python %prog --b2t -i INPUT -o OUTPUT -d DIM \n"
             "python %prog --t2b -i INPUT -o OUTPUT")
    parser = OptionParser(usage)
    parser.add_option(
        "--b2t",
        action="store_true",
        help="convert parameter file of embedding model from binary to text")
    parser.add_option(
        "--t2b",
        action="store_true",
        help="convert parameter file of embedding model from text to binary")
    parser.add_option(
        "-i", action="store", dest="input", help="input parameter file name")
    parser.add_option(
        "-o", action="store", dest="output", help="output parameter file name")
    parser.add_option(
        "-d",
        action="store",
        dest="dim",
        type="int",
        help="dimension of parameter")
    (options, args) = parser.parse_args(argv)

    # The two modes share -i/-o, so running both (or neither) makes no sense.
    if bool(options.b2t) == bool(options.t2b):
        parser.error("specify exactly one of --b2t or --t2b")
    if not options.input or not options.output:
        parser.error("both -i INPUT and -o OUTPUT are required")

    if options.b2t:
        if options.dim is None or options.dim <= 0:
            parser.error("--b2t requires -d DIM with a positive integer")
        binary2text(options.input, options.output, options.dim)
    else:
        text2binary(options.input, options.output)
|
|
|
|
|
|
# Run the converter only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()
|