JavaScriptCore is very strict about invalid UTF symbols.

dryganets · dryganets · commit 89975d76b793 · 2017-07-31T13:35:09.000-07:00
If you pass it an invalid UTF-8 string, the string is decoded as an empty string.

The current implementation of progressive downloading for Android blindly cuts the response into 8KB chunks.
That causes a problem when a chunk boundary falls in the middle of a multi-byte symbol.

To prevent this, I added a class that detects this case and cuts the chunk after the last complete symbol.
The remainder is prepended to the next chunk of data.
diff --git a/ReactAndroid/src/main/java/com/facebook/react/common/StandardCharsets.java b/ReactAndroid/src/main/java/com/facebook/react/common/StandardCharsets.java
@@ -0,0 +1,38 @@
+/**
+ * Copyright (c) 2017-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+package com.facebook.react.common;
+
+import java.nio.charset.Charset;
+
+/**
+ * Not all versions of the Android SDK have this class in the java.nio.charset package,
+ * which is the reason to keep this copy of the UTF charset constants around.
+ */
+public class StandardCharsets {
+
+  /**
+   * Eight-bit UCS Transformation Format
+   */
+  public static final Charset UTF_8 = Charset.forName("UTF-8");
+
+  /**
+   * Sixteen-bit UCS Transformation Format, byte order identified by an
+   * optional byte-order mark
+   */
+  public static final Charset UTF_16 = Charset.forName("UTF-16");
+
+  /**
+   * Sixteen-bit UCS Transformation Format, big-endian byte order
+   */
+  public static final Charset UTF_16BE = Charset.forName("UTF-16BE");
+  /**
+   * Sixteen-bit UCS Transformation Format, little-endian byte order
+   */
+  public static final Charset UTF_16LE = Charset.forName("UTF-16LE");
+}
diff --git a/ReactAndroid/src/main/java/com/facebook/react/modules/network/NetworkingModule.java b/ReactAndroid/src/main/java/com/facebook/react/modules/network/NetworkingModule.java
@@ -14,6 +14,7 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;
+import java.nio.charset.Charset;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
@@ -29,6 +30,7 @@
 import com.facebook.react.bridge.ReadableArray;
 import com.facebook.react.bridge.ReadableMap;
 import com.facebook.react.bridge.WritableMap;
+import com.facebook.react.common.StandardCharsets;
 import com.facebook.react.common.network.OkHttpCallUtil;
 import com.facebook.react.module.annotations.ReactModule;
 import com.facebook.react.modules.core.DeviceEventManagerModule.RCTDeviceEventEmitter;
@@ -408,20 +410,45 @@ private void readWithProgress(
       // Ignore
     }
 
-    Reader reader = responseBody.charStream();
-    try {
-      char[] buffer = new char[MAX_CHUNK_SIZE_BETWEEN_FLUSHES];
-      int read;
-      while ((read = reader.read(buffer)) != -1) {
-        ResponseUtil.onIncrementalDataReceived(
-          eventEmitter,
-          requestId,
-          new String(buffer, 0, read),
-          totalBytesRead,
-          contentLength);
+    Charset charset = responseBody.contentType() == null ? StandardCharsets.UTF_8 :
+      responseBody.contentType().charset(StandardCharsets.UTF_8);
+
+    if (StandardCharsets.UTF_8.equals(charset)) {
+      ProgressiveUTF8StreamDecoder streamDecoder = new ProgressiveUTF8StreamDecoder();
+      InputStream inputStream = responseBody.byteStream();
+      try {
+        byte[] buffer = new byte[MAX_CHUNK_SIZE_BETWEEN_FLUSHES];
+        int read;
+        while ((read = inputStream.read(buffer)) != -1) {
+          ResponseUtil.onIncrementalDataReceived(
+            eventEmitter,
+            requestId,
+            streamDecoder.decodeNext(buffer, read),
+            totalBytesRead,
+            contentLength);
+        }
+      } finally {
+        inputStream.close();
+      }
+    } else {
+      // TODO: in UTF-16 some symbols take 4 bytes, i.e. 2 chars (HIGH and LOW surrogates).
+      // Ideally we should handle this, but it is a much more complex task: it involves handling
+      // the Byte Order Mark and UTF-16 endianness. Let's keep it in sync with iOS for now.
+      Reader reader = responseBody.charStream();
+      try {
+        char[] buffer = new char[MAX_CHUNK_SIZE_BETWEEN_FLUSHES];
+        int read;
+        while ((read = reader.read(buffer)) != -1) {
+          ResponseUtil.onIncrementalDataReceived(
+            eventEmitter,
+            requestId,
+            new String(buffer, 0, read),
+            totalBytesRead,
+            contentLength);
+        }
+      } finally {
+        reader.close();
       }
-    } finally {
-      reader.close();
     }
   }
 
diff --git a/ReactAndroid/src/main/java/com/facebook/react/modules/network/ProgressiveUTF8StreamDecoder.java b/ReactAndroid/src/main/java/com/facebook/react/modules/network/ProgressiveUTF8StreamDecoder.java
@@ -0,0 +1,88 @@
+/**
+* Copyright (c) 2017-present, Facebook, Inc.
+* All rights reserved.
+*
+* This source code is licensed under the BSD-style license found in the
+* LICENSE file in the root directory of this source tree. An additional grant
+* of patent rights can be found in the PATENTS file in the same directory.
+*/
+package com.facebook.react.modules.network;
+
+import com.facebook.react.common.StandardCharsets;
+
+/**
+* Class to decode UTF-8 strings from byte array chunks.
+* A UTF-8 symbol takes from 1 to 4 bytes, so a chunk boundary can fall inside a symbol;
+* decoding every chunk on its own would then break the original string.
+*
+* Feed all chunks of a stream, in order, to one instance: a cut symbol is kept between calls.
+*/
+public class ProgressiveUTF8StreamDecoder {
+
+  /** Bytes of a symbol cut off at the end of the previous chunks, or null if there are none. */
+  private byte[] mRemainder = null;
+
+  /**
+   * Bit mask implementation performed 1.5x worse than this one
+   * @param firstByte - first byte of the symbol
+   * @return count of bytes in the symbol (1 for any byte that is not a multi-byte lead byte)
+   */
+  private int symbolSize(byte firstByte) {
+    int code = firstByte & 0XFF;
+    if (code >= 240) {
+      return 4;
+    } else if (code >= 224) {
+      return 3;
+    } else if (code >= 192) {
+      return 2;
+    }
+    return 1;
+  }
+
+  /**
+   * Finds the end of the last complete symbol. Only the last symbol can be cut off, so it is
+   * enough to look at the (at most four) trailing bytes instead of walking the whole buffer.
+   * @param bytes buffer to inspect
+   * @param total count of meaningful bytes in the buffer
+   * @return offset of the lead byte of a cut-off last symbol, or total if nothing is cut off
+   */
+  private int completeLength(byte[] bytes, int total) {
+    int lowest = Math.max(0, total - 4);
+    int lead = total - 1;
+    // Step back over continuation bytes (10xxxxxx) to the lead byte of the last symbol.
+    while (lead >= lowest && (bytes[lead] & 0xC0) == 0x80) {
+      lead--;
+    }
+    // No lead byte found (empty or malformed input): hold nothing back, let the decoder cope.
+    if (lead < lowest) {
+      return total;
+    }
+    return symbolSize(bytes[lead]) > total - lead ? lead : total;
+  }
+
+  /**
+   * Parses data to UTF-8 String
+   * If the last symbol is partial we save it to mRemainder and prepend it to the next chunk.
+   * A symbol may even span several chunks: the remainder grows and an empty string is returned.
+   * @param data buffer holding the next chunk of the stream; it is not modified
+   * @param length length of data to decode
+   * @return the decoded complete symbols, possibly an empty string
+   */
+  public String decodeNext(byte[] data, int length) {
+    byte[] bytes = data;
+    int total = length;
+    if (mRemainder != null) {
+      total = mRemainder.length + length;
+      bytes = new byte[total];
+      System.arraycopy(mRemainder, 0, bytes, 0, mRemainder.length);
+      System.arraycopy(data, 0, bytes, mRemainder.length, length);
+      mRemainder = null;
+    }
+    int complete = completeLength(bytes, total);
+    if (complete < total) {
+      mRemainder = new byte[total - complete];
+      System.arraycopy(bytes, complete, mRemainder, 0, mRemainder.length);
+    }
+    return new String(bytes, 0, complete, StandardCharsets.UTF_8);
+  }
+}
diff --git a/ReactAndroid/src/test/java/com/facebook/react/modules/network/ProgressiveUTF8StreamDecoderTest.java b/ReactAndroid/src/test/java/com/facebook/react/modules/network/ProgressiveUTF8StreamDecoderTest.java
@@ -0,0 +1,108 @@
+/**
+ * Copyright (c) 2017-present, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree. An additional grant
+ * of patent rights can be found in the PATENTS file in the same directory.
+ */
+package com.facebook.react.modules.network;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.robolectric.RobolectricTestRunner;
+
+import java.nio.charset.Charset;
+
+
+@RunWith(RobolectricTestRunner.class)
+public class ProgressiveUTF8StreamDecoderTest {
+
+  private static String TEST_DATA_1_BYTE = "Lorem ipsum dolor sit amet, ea ius viris laoreet gloriatur, ea enim illud mel. Ea eligendi erroribus inciderint sea, id nemore sensibus contentiones qui. Eos et nulla abhorreant, noluisse adipiscing reprehendunt an sit. Harum iriure meliore ne nec, clita semper voluptaria at sea. Ius civibus vituperata reprehendunt ut.\n" +
+    "\n" +
+    "Sed nisl postea maiorum ex, mea eros verterem ea. Ne usu brute debitis appareat. Ad quem reprimique dissentias duo. Sit an labitur eleifend, illud zril audiam nam ex, epicuri luptatum ne usu. Lorem mundi utinam vix ea.\n" +
+    "\n" +
+    "Te eam nominati qualisque. Ut praesent consetetur pro. Soleat vivendum vim ea. Altera dolores eam in. Eum at praesent complectitur. Nec ea inani definitiones, tantas vivendum mei an, mea an ubique omnium latine. Has mundi ocurreret ei, nam ea iuvaret gloriatur.\n" +
+    "\n" +
+    "Ad omnes malorum vim, no latine facilisi mel, dicant salutandi conclusionemque ei est. Nam cu partem alterum minimum. Et quo iriure deleniti accommodare, ad impetus perfecto liberavisse pri. Instructior necessitatibus ut mel, ex cum sumo atqui comprehensam, ei nullam oporteat sed. Ius meliore placerat cu.\n" +
+    "\n" +
+    "Eum in ferri nobis, eam eu verear facilisis referrentur. Veniam epicuri referrentur at nam. Vel congue diceret fabulas te, ei fabellas temporibus mei. Nemore corrumpit quo ex, et vis soluta reprehendunt. Et eos eripuit atomorum.\n" +
+    "\n" +
+    "Eum no novum tantas decore. Indoctum definiebas intellegam ut vel. Cu per ipsum graeco, in nam dico dolore, usu id ludus consulatu. Vis an clita commune, cu quot quaeque cum. In eos semper aperiri. Ne mea probo inermis, no vis audiam volutpat.\n" +
+    "\n" +
+    "Cu quaeque scaevola vis. Civibus commune scriptorem vim an, vim ea vocent petentium consequuntur, meis propriae invidunt eam ex. Pro et ponderum recusabo sapientem. Vel legere possim ornatus ne, saepe commodo scaevola an quo. An scaevola repudiandae sed. Eam ei veri nemore.\n" +
+    "\n" +
+    "Ullum deleniti cum at. An has soleat docendi, epicuri erroribus inciderint pro ea. Noluisse invidunt splendide quo in, eam odio invenire ea. Eu hinc definiebas scripserit duo, has cu equidem ponderum expetenda, eum vulputate intellegat id. Pri eu natum semper pertinax, ei vel inani aliquip habemus, sit an facer dicam. Et graeci abhorreant contentiones duo, et summo partiendo conclusionemque per.\n" +
+    "\n" +
+    "Sed ei etiam iudico abhorreant. Pri an regione fastidii, clita discere eu nec. Torquatos percipitur inciderint eos in, id per prompta blandit. Sit et epicuri deleniti. Per labores corpora no.\n" +
+    "\n" +
+    "Quodsi melius facilis pri ei, has adhuc recusabo reprimique ut. Laoreet definitionem cum cu, amet nonumes ut vis, qui ut sonet ancillae. Vim no doctus efficiantur, ancillae indoctum ex sea, vel eu fabulas volumus argumentum. Ex eum aeque commune placerat, nam choro tamquam luptatum et. Ne sea vero idque liberavisse";
+
+  private static String TEST_DATA_2_BYTES = "Лорем ипсум долор сит амет, доминг дисцере ад вих, велит игнота ратионибус мел цу. Не вирис малорум яуаеяуе хас, еу либрис доцтус хис. Моллис садипсцинг ан цум, семпер молестие репрехендунт усу те. Цасе аетерно оффендит ан еос. При ан толлит опортере оцурререт, ан яуот мутат трацтатос вих.\n" +
+    "\n" +
+    "Нец фалли харум ратионибус еа. Магна адмодум ат нам, яуи еа рецусабо мандамус, аццусам цонсеяуунтур цу хис. Импедит цотидиеяуе улламцорпер еа мел, усу ет долорес аргументум. Веро торяуатос ех нам, цибо либерависсе ест еи. Вис долор омниум сплендиде ад, велит рецусабо цонсететур иус цу.\n" +
+    "\n" +
+    "Еи дуо меис атоморум сигниферумяуе, аугуе аццусам мел ет. Ут ностро легендос хонестатис пер, ут яуас мовет сеа. Меа цу продессет аппеллантур. Вис еа яуод оффендит, дебет видерер ет нам.\n" +
+    "\n" +
+    "Еам еа дебитис иудицабит, не хас иллуд цивибус. Усу ет алии уллум утамур. Поссит цонституто те яуи, хас ет лаудем аудире, нам еи епицури салутанди. Лудус делицатиссими цум еу, либер адиписцинг еи нец. Ид ерипуит лобортис антиопам хис, санцтус елигенди неглегентур сед ут, вел сентентиае инструцтиор еи. Ан про унум яуалисяуе.\n" +
+    "\n" +
+    "Ат еррор алтера сит, пер еу яуот номинави. Пертинах репудиаре цум еу. Еа фуиссет антиопам вим, пробатус реферрентур ут иус. Еум ад модус утрояуе диспутандо.\n" +
+    "\n" +
+    "Ехерци бландит ут меа. Солет импедит сед ад. Дуо порро тимеам аудире не, алии ерант номинави цу нец, сит ферри веритус адиписци те. Те меи синт адверсариум, ад феугаит инвидунт луцилиус сед, дицунт нумяуам нам те. Еум дицант елеифенд цонсецтетуер ет, суммо вереар епицуреи не про. Не лудус сцрипта опортере вим, еи дуо идяуе алияуам сигниферумяуе. Цум еу лабитур инвенире, про ессе губергрен темпорибус еи, ад хис минимум пертинах.\n" +
+    "\n" +
+    "Дуо ад вери евертитур интеллегат, демоцритум еффициенди дуо ет. Нец но доценди демоцритум сцрипторем, витуперата цонституам нецесситатибус ут вим. Яуи виде санцтус мандамус ан, нонумес принципес вел ат, ех дуо инани нулла. Петентиум маиестатис еам ин, те ерант дебитис еурипидис вис. Но вел антиопам цотидиеяуе еффициантур, сеа еи нибх нонумы инцидеринт.\n" +
+    "\n" +
+    "Одио омнес но яуо, популо ноструд иус ад. Инани хонестатис но вис. Хис еу лудус партем персиус, пурто малис витуперата при ан, еи елаборарет ассуеверит вим. Цу бруте утинам тинцидунт вих, цум ад дицтас лобортис лаборамус. Нец хабемус рецусабо ат, ех фацилис денияуе ест. При те велит алияуам аццусамус, юсто утамур антиопам но нам.\n" +
+    "\n" +
+    "Про не еррем иудицо мелиоре, еи цибо ерудити санцтус хас. Яуод еяуидем еу вис, вих яуидам легимус ад, ид сеа солум легере мандамус. Аеяуе детрахит ех иус, суас вертерем еум цу. Еи вим алиа ехерци пхаедрум, хас не лаборес цоррумпит. Ат граеци сцрипта вим.\n" +
+    "\n" +
+    "Иус ат менандри персеяуерис. Про модус дицта еу, ин граеци доценди фиерент при, еи хас аугуе мандамус дефинитионем. Ет путент интерпретарис сит, перицула сентентиае ат ест. При ут сумо видит волуптатибус, нобис деленити еа.";
+  private static String TEST_DATA_3_BYTES = "案のづよド捕毎エオ文疑ろめた今宮レ秋像とが供持属ょー真場中ホサヒ不箱らご著質ーぼンろ保6年読さ系蔵べるル緩参フシセタ鮮県フずッ歳民ナセ楽飲匹恒桜ぱ。要電ネソメ嘉負向ス援中ぜく界党フネ属平ぎ象越容レ書95争効99争効7翌テ売約わこよッ紙点発事9入そさ補綱のラず他亭匠ぞ。\n" +
+    "\n" +
+    "天レ供内ソ愛7読でぽせ回書ほごしな浅月企設潟せぐり裂個ホヌヤ局題制エ柏央ざぽ。外くにさ下格か終所あ硬当ワ着少選とけリへ康件終にぎ季規らおず給測トユテ考毎サトス事版にーご文8忙チ深暮タヲムラ度6応しぞぎぐ装速て続際ぞ発准揮包孤てい。制はたちき合南む乙甲ゅさと捕4球任条こでン頭広セスモウ月夜エス面陽ヨネ力京ウリ紙聞ト印2火映ラ基頭スフ点愛伎協ねド。\n" +
+    "\n" +
+    "属と共代みむもず以監すい者新ス田政家ヱス使校音刑トホ則上ゅぐ一未ヌ意40芸標んは学必強ゅ帝歯没牧具もか。58新イシレ正米ニユ負皇っぐせの必容キソタコ公3容ーつぶべ年然検ざ整賞ニチ注興ぐ放約えあ野夜磨やゃフよ。柳ソシアテ申1科ル舗紀深むぜ競供とび室全ハネ測高エラク権暮ヲクオト館暮ヌ黒杯クリぴぽ火竹ねる種4帰替やあい北問クルゃン登壌粉つどべ。";
+
+  private static final String TEST_DATA_4_BYTES ="\uD800\uDE55\uD800\uDE55\uD800\uDE55 \uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE55" +
+    "\uD800\uDE55\uD800\uDE55\uD800\uDE55 \uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80" +
+    "\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80" +
+    "\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE55\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80\uD800\uDE80" +
+    "\uD800\uDE80\uD800\uDE80\uD800\uDE80";
+
+  @Test
+  public void testUnicode1Byte() {
+    chunkString(TEST_DATA_1_BYTE, 64); // ASCII text: one byte per symbol, nothing can be split
+  }
+
+  @Test
+  public void testUnicode2Bytes() {
+    chunkString(TEST_DATA_2_BYTES, 63); // Cyrillic letters take 2 bytes each in UTF-8
+  }
+
+  @Test
+  public void testUnicode3Bytes() throws Exception {
+    chunkString(TEST_DATA_3_BYTES, 64); // mostly Japanese text, 3 bytes per symbol in UTF-8
+  }
+
+  @Test
+  public void testUnicode4Bytes() throws Exception {
+    chunkString(TEST_DATA_4_BYTES, 111); // each surrogate pair in the literal is 4 bytes in UTF-8
+  }
+
+  private void chunkString(String originalString, int chunkSize) {
+    byte data [] = originalString.getBytes(Charset.forName("UTF-8"));
+
+    StringBuilder builder = new StringBuilder();
+    ProgressiveUTF8StreamDecoder collector = new ProgressiveUTF8StreamDecoder();
+    byte[] buffer = new byte[chunkSize];
+    for (int i = 0; i < data.length; i+= chunkSize) {
+      int bytesRead = Math.min(chunkSize, data.length - i);
+      System.arraycopy(data,  i, buffer, 0, bytesRead );
+      builder.append(collector.decodeNext(buffer, bytesRead ));
+    }
+
+    String actualString = builder.toString();
+    Assert.assertEquals(originalString, actualString);
+  }
+}