Reading bytes from a JavaScript string

Monday, April 23, 2012

Reading bytes from a JavaScript string

I have a string containing binary data in JavaScript. Now I want to read, for example, an integer from it. So I get the first 4 characters, use charCodeAt , do some shifting, etc. to get an integer.

The problem is that strings in JavaScript are UTF-16 (instead of ASCII) and charCodeAt often returns values higher than 255, i.e. values that do not fit in a single byte.

The Mozilla reference states that "The first 128 Unicode code points are a direct match of the ASCII character encoding." (what about ASCII values > 128?).

How can I convert the result of charCodeAt to an ASCII value? Or is there a better way to convert a string of four characters to a 4 byte integer?

Source: Tips4all

8 comments:

UserApril 23, 2012 at 8:06 PM
I believe that you can do this with relatively simple bit operations:

/**
 * Convert a string into a flat array of byte values (0-255).
 *
 * Every UTF-16 code unit reported by charCodeAt is split into as few bytes as
 * it needs (one byte for values up to 0xFF, two bytes above that), and those
 * bytes are emitted most-significant byte first.
 *
 * @param {string} str - The string to convert.
 * @returns {number[]} The byte values, in string order.
 */
function stringToBytes ( str ) {
  var re = [];
  for (var i = 0; i < str.length; i++) {
    var ch = str.charCodeAt(i); // get char
    var st = []; // per-character "stack", filled least-significant byte first
    do {
      st.push(ch & 0xFF); // push the low byte
      ch = ch >> 8; // shift value down by 1 byte
    } while (ch);
    // Popping the stack yields the most-significant byte first. Appending in
    // place avoids re.concat(), which copied the entire result array for
    // every character and made the function quadratic on long strings.
    while (st.length) {
      re.push(st.pop());
    }
  }
  // return an array of bytes
  return re;
}

// Example: U+1242 is split into 0x12, 0x42 (18, 66) and U+4123 into 0x41, 0x23 (65, 35).
stringToBytes( "A\u1242B\u4123C" ); // [65, 18, 66, 66, 65, 35, 67]

It should be a simple matter to sum the output up by reading the byte array as if it were memory and adding it up into larger numbers:

/**
 * Combine four consecutive entries of arr, starting at index offs, into a
 * single number with the first entry as the most significant byte.
 *
 * The top byte is positioned with a 32-bit shift, so a first byte of 0x80 or
 * more produces a negative result.
 *
 * @param {number[]} arr - Array of byte values.
 * @param {number} offs - Index of the most significant byte.
 * @returns {number} The combined value.
 */
function getIntAt ( arr, offs ) {
  var topByte = arr[offs + 0] << 24;
  var upperMiddle = arr[offs + 1] << 16;
  var lowerMiddle = arr[offs + 2] << 8;
  var bottomByte = arr[offs + 3];
  return topByte + upperMiddle + lowerMiddle + bottomByte;
}

/**
 * Combine two consecutive entries of arr, starting at index offs, into one
 * 16-bit value with the first entry as the high byte.
 *
 * @param {number[]} arr - Array of byte values.
 * @param {number} offs - Index of the high byte.
 * @returns {number} The combined value.
 */
function getWordAt ( arr, offs ) {
  var highByte = arr[offs + 0] << 8;
  var lowByte = arr[offs + 1];
  return highByte + lowByte;
}

// Reads the 16-bit word at byte offset 1 and formats it as hex after a literal backslash-u prefix.
'\\u' + getWordAt( stringToBytes( "A\u1242" ), 1 ).toString(16); // the string \u1242 (backslash, "u", then "1242")
ReplyDelete
Replies
UserApril 23, 2012 at 8:06 PM
Borgar's answer seems correct.

Just wanted to clarify one point. Javascript treats bitwise operations as '32-bit signed int's, where the last (left-most) bit is the sign bit. Ie,

// Top bit of the first byte clear: the result is positive.
getIntAt([0x7f,0,0,0],0).toString(16) // "7f000000"

// Top bit of the first byte set: the 32-bit shift result is negative, so toString(16) prints a minus sign.
getIntAt([0x80,0,0,0],0).toString(16) // "-80000000"

However, for octet-data processing (e.g., network streams), you usually want the 'unsigned int' representation. This can be accomplished by appending '>>> 0' (the zero-fill right-shift operator), which makes JavaScript interpret the 32-bit result as unsigned.

/**
 * Combine four consecutive entries of arr, starting at index offs, into an
 * unsigned 32-bit value with the first entry as the most significant byte.
 *
 * @param {number[]} arr - Array of byte values.
 * @param {number} offs - Index of the most significant byte.
 * @returns {number} The combined value, reinterpreted as unsigned.
 */
function getUIntAt ( arr, offs ) {
  var topByte = arr[offs + 0] << 24;
  var upperMiddle = arr[offs + 1] << 16;
  var lowerMiddle = arr[offs + 2] << 8;
  var bottomByte = arr[offs + 3];
  var signedTotal = topByte + upperMiddle + lowerMiddle + bottomByte;
  // The zero-fill shift by 0 reinterprets the 32-bit pattern as unsigned.
  return signedTotal >>> 0;
}

// Same input as the negative getIntAt example, but the '>>> 0' makes the result non-negative.
getUIntAt([0x80,0,0,0],0).toString(16) // "80000000"
ReplyDelete
Replies
UserApril 23, 2012 at 8:06 PM
Borgar's solution works perfectly. In case you want a more concrete implementation, you may want to have a look at the BinaryReader class from vjeux (which, for the record, is based on the binary-parser class from Jonas Raoni Soares Silva).
ReplyDelete
Replies
UserApril 23, 2012 at 8:06 PM
How did you get the binary data into the string in the first place? How the binary data gets encoded into a string is an IMPORTANT consideration, and you need an answer to that question before you can proceed.

One way I know of to get binary data into a string, is to use the XHR object, and set it to expect UTF-16.

Once it's in utf-16, you can retrieve 16-bit numbers from the string using "....".charCodeAt(0)

which will be a number between 0 and 65535

Then, if you like, you can convert that number into two numbers between 0 and 255 like this:

// Zero-fill shift right by 8 bits: drops the low byte of mynumber and keeps what is above it.
var leftByte = mynumber>>>8;
// Mask with 255 (0xFF): keeps only the low 8 bits of mynumber.
var rightByte = mynumber&255;
ReplyDelete
Replies
UserApril 23, 2012 at 8:06 PM
Here are two methods for encoding a string into a UTF-8 byte array and decoding it back.

/**
 * Helpers for converting between JavaScript strings and arrays of UTF-8
 * byte values, built on encodeURIComponent / decodeURIComponent.
 */
var utf8 = {};

/**
 * Encode a string as an array of UTF-8 byte values.
 *
 * @param {string} str - The string to encode.
 * @returns {number[]} The UTF-8 bytes of str.
 * @throws {URIError} If str contains an unpaired surrogate
 *   (raised by encodeURIComponent).
 */
utf8.toByteArray = function(str) {
  var byteArray = [];
  for (var i = 0; i < str.length; i++) {
    var code = str.charCodeAt(i);
    if (code <= 0x7F) {
      // ASCII maps to a single identical byte.
      byteArray.push(code);
      continue;
    }
    var ch = str.charAt(i);
    // A high surrogate followed by a low surrogate is one code point and must
    // be percent-encoded as a pair; encoding either half alone throws.
    if (code >= 0xD800 && code <= 0xDBFF) {
      // charCodeAt yields NaN past the end, which fails both comparisons.
      var next = str.charCodeAt(i + 1);
      if (next >= 0xDC00 && next <= 0xDFFF) {
        ch += str.charAt(i + 1);
        i++;
      }
    }
    // "%D0%94" -> "D0%94" -> ["D0", "94"]
    var h = encodeURIComponent(ch).slice(1).split('%');
    for (var j = 0; j < h.length; j++) {
      byteArray.push(parseInt(h[j], 16));
    }
  }
  return byteArray;
};

/**
 * Decode an array of UTF-8 byte values back into a string.
 *
 * @param {number[]} byteArray - UTF-8 byte values.
 * @returns {string} The decoded string.
 * @throws {URIError} If the bytes are not valid UTF-8
 *   (raised by decodeURIComponent).
 */
utf8.parse = function(byteArray) {
  var str = '';
  for (var i = 0; i < byteArray.length; i++) {
    var b = byteArray[i];
    if (b > 0x7F) {
      // Non-ASCII bytes are always two hex digits, so no padding is needed.
      str += "%" + b.toString(16).toUpperCase();
    } else if (b === 0x25) {
      // A literal "%" must be escaped so it is not read as an escape prefix.
      str += "%25";
    } else {
      str += String.fromCharCode(b);
    }
  }
  return decodeURIComponent(str);
};

// sample: round-trip a short Cyrillic string through utf8.toByteArray and utf8.parse
var str = "Да!";
var ba = utf8.toByteArray(str);
alert(ba); // 208, 148, 208, 176, 33 (two bytes per Cyrillic letter, one for "!")
alert(ba.length); // 5
alert(utf8.parse(ba)); // Да!
ReplyDelete
Replies
UserApril 23, 2012 at 8:06 PM
While @Borgar answers the question correctly, his solution is pretty slow. It took me a while to track it down (I used his function somewhere in a larger project), so I thought I would share my insight.

I ended up having something like @Kadm. It's not some little percent faster, it's like 500 times faster (no exaggeration!). I wrote a little benchmark, so you can see it for yourself :)

/**
 * Convert a string into a flat array of byte values (0-255).
 *
 * Code units below 127 are appended directly; any other code unit is split
 * into its bytes and appended most-significant byte first.
 *
 * @param {string} str - The string to convert.
 * @returns {number[]} The byte values, in string order.
 */
function stringToBytesFaster ( str ) {
  var bytes = [];
  var total = str.length;
  for (var pos = 0; pos < total; pos++) {
    var code = str.charCodeAt(pos);
    if (code < 127) {
      // fast path: the value already fits in one byte
      bytes.push(code & 0xFF);
      continue;
    }
    // collect the bytes least-significant first ...
    var pending = [];
    do {
      pending.push(code & 0xFF);
      code = code >> 8; // move on to the next higher byte
    } while (code);
    // ... then drain from the end so the highest byte is written first
    while (pending.length > 0) {
      bytes.push(pending.pop());
    }
  }
  return bytes;
}
ReplyDelete
Replies
UserApril 23, 2012 at 8:06 PM
I'm going to assume for a second that your objective is to read arbitrary bytes from a string.
My first suggestion would be to make your string representation a hexadecimal representation of the binary data.

You can read the values using conversions to numbers from hex:

var BITS_PER_BYTE = 8;
// One hexadecimal digit encodes 4 bits, so a byte takes two characters.
// removeBytes referenced this constant without it ever being defined.
var BITS_PER_CHAR = 4;

/**
 * Parse the first numBytes bytes of a hex string as one number.
 *
 * @param {string} hexString - Hexadecimal digits, two per byte.
 * @param {number} numBytes - How many leading bytes to read.
 * @returns {number} The parsed value, or NaN if no hex digits could be read.
 *   Values wider than 53 bits lose precision.
 */
function readBytes(hexString, numBytes) {
  var charCount = numBytes * (BITS_PER_BYTE / BITS_PER_CHAR);
  return parseInt(hexString.substring(0, charCount), 16);
}

/**
 * Return hexString without its first numBytes bytes.
 *
 * @param {string} hexString - Hexadecimal digits, two per byte.
 * @param {number} numBytes - How many leading bytes to drop.
 * @returns {string} The remaining hex digits.
 */
function removeBytes(hexString, numBytes) {
  var charCount = numBytes * (BITS_PER_BYTE / BITS_PER_CHAR);
  return hexString.slice(charCount);
}

The functions can then be used to read whatever you want:

// Demo: consume a hex string two bytes (four hex digits) at a time.
var hex = '4ef2c3382fd';
alert( 'We had: ' + hex );

// Parse the leading "4ef2" as a number and show it in binary.
var intVal = readBytes(hex,2);
alert( 'Two bytes: ' + intVal.toString(2) );

// Drop the bytes that were just read so the next read starts after them.
hex = removeBytes(hex,2);
alert( 'Now we have: ' + hex );

You can then interpret the byte string however you want.

Hope this helps!
Cheers!
ReplyDelete
Replies
UserApril 23, 2012 at 8:06 PM
An improvement to Borgar's solution:

...
do {
st.unshift( ch & 0xFF ); // prepend the low byte, so the most-significant byte ends up first
ch = ch >> 8; // shift value down by 1 byte
}
while ( ch );
// add the collected bytes to the result
// no reverse() is needed here: unshift already stored them most-significant byte first
re = re.concat( st );
...
ReplyDelete
Replies

Add comment

Ccna final exam - java, php, javascript, ios, cshap all in one

Monday, April 23, 2012

Reading bytes from a JavaScript string

8 comments:

Total Pageviews