Skip to content

Commit

Permalink
investigate
Browse files Browse the repository at this point in the history
  • Loading branch information
bpasero committed Sep 1, 2022
1 parent 73d0816 commit a5c0f6c
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 19 deletions.
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
"@vscode/sqlite3": "5.0.8",
"@vscode/sudo-prompt": "9.3.1",
"@vscode/vscode-languagedetection": "1.0.21",
"chardet": "^1.4.0",
"graceful-fs": "4.2.8",
"http-proxy-agent": "^2.1.0",
"https-proxy-agent": "^2.2.3",
Expand Down
24 changes: 5 additions & 19 deletions src/vs/workbench/services/textfile/common/encoding.ts
Original file line number Diff line number Diff line change
Expand Up @@ -316,27 +316,22 @@ const IGNORE_ENCODINGS = ['ascii', 'utf-16', 'utf-32'];
* Guesses the encoding from buffer.
*/
// NOTE(review): this function is shown as a rendered diff hunk without +/- markers, so
// lines from the previous (jschardet) and the new (chardet) implementation are interleaved.
// As printed it is not compilable (`guessed` and `enc` are each declared twice, and two
// `if (...) {` openers share a single closing brace). The "old side"/"new side" notes below
// are inferred from the file's "5 additions & 19 deletions" summary — confirm against the
// actual commit before relying on them.
//
// Purpose: guess the text encoding of `buffer`, looking only at its first
// AUTO_ENCODING_GUESS_MAX_BYTES bytes. Resolves to `null` when the detector yields nothing
// or when the lower-cased guess is listed in IGNORE_ENCODINGS; otherwise resolves to the
// result of `toIconvLiteEncoding` applied to the guess.
async function guessEncodingByBuffer(buffer: VSBuffer): Promise<string | null> {
// old side (presumably removed): lazily load the jschardet detector
const jschardet = await import('jschardet');
// new side (presumably added): lazily load the chardet detector instead
const nodechardet = await import('chardet');

// ensure to limit buffer for guessing due to https://github.com/aadsm/jschardet/issues/53
const limitedBuffer = buffer.slice(0, AUTO_ENCODING_GUESS_MAX_BYTES);

// old side (presumably removed, through the `if` on jschardet's result below):
// before guessing jschardet calls toString('binary') on input if it is a Buffer,
// since we are using it inside browser environment as well we do conversion ourselves
// https://github.com/aadsm/jschardet/blob/v2.1.1/src/index.js#L36-L40
const binaryString = encodeLatin1(limitedBuffer.buffer);

// old side: the result is treated as an object with an `encoding` property
const guessed = jschardet.detect(binaryString);
if (!guessed || !guessed.encoding) {
// new side: the raw bytes are passed directly and the result is used as the encoding name itself
const guessed = nodechardet.detect(limitedBuffer.buffer);
if (!guessed) {
return null;
}

// old side: lower-case the `encoding` property for the ignore-list lookup
const enc = guessed.encoding.toLowerCase();
// new side: lower-case the returned name for the ignore-list lookup
// NOTE(review): IGNORE_ENCODINGS holds 'ascii', 'utf-16', 'utf-32' — verify that chardet
// reports names that lower-case to exactly these values, otherwise this filter never matches.
const enc = guessed.toLowerCase();
if (0 <= IGNORE_ENCODINGS.indexOf(enc)) {
return null; // see comment above why we ignore some encodings
}

// old side: normalize the detected `encoding` property
return toIconvLiteEncoding(guessed.encoding);
// new side: normalize the detected name
return toIconvLiteEncoding(guessed);
}

const JSCHARDET_TO_ICONV_ENCODINGS: { [name: string]: string } = {
Expand All @@ -351,15 +346,6 @@ function toIconvLiteEncoding(encodingName: string): string {
return mapped || normalizedEncodingName;
}

/**
 * Builds a "binary" (Latin-1) string from raw bytes: the result has exactly one
 * character per input byte, and each character's code unit equals that byte's value.
 *
 * @param buffer the bytes to convert
 * @returns a string of the same length as `buffer`; empty when `buffer` is empty
 */
function encodeLatin1(buffer: Uint8Array): string {
	// Map each byte to its single-code-unit character, then join once at the end.
	// The arrow wrapper keeps the mapper's index argument away from String.fromCharCode.
	const characters = Array.from(buffer, byte => String.fromCharCode(byte));

	return characters.join('');
}

/**
* The encodings that are allowed in a settings file don't match the canonical encoding labels specified by WHATWG.
* See https://encoding.spec.whatwg.org/#names-and-labels
Expand Down
5 changes: 5 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -2551,6 +2551,11 @@ chardet@^0.7.0:
resolved "https://registry.yarnpkg.com/chardet/-/chardet-0.7.0.tgz#90094849f0937f2eedc2425d0d28a9e5f0cbad9e"
integrity sha512-mT8iDcrh03qDGRRmoA2hmBJnxpllMR+0/0qlzjqZES6NdiWDcZkCNAk4rPFZ9Q85r27unkiNNg8ZOiwZXBHwcA==

chardet@^1.4.0:
version "1.4.0"
resolved "https://registry.yarnpkg.com/chardet/-/chardet-1.4.0.tgz#278748f260219990fb2167dbfb1b253ca26b41ea"
integrity sha512-NpwMDdSIprbYx1CLnfbxEIarI0Z+s9MssEgggMNheGM+WD68yOhV7IEA/3r6tr0yTRgQD0HuZJDw32s99i6L+A==

charenc@~0.0.1:
version "0.0.2"
resolved "https://registry.yarnpkg.com/charenc/-/charenc-0.0.2.tgz#c0a1d2f3a7092e03774bfa83f14c0fc5790a8667"
Expand Down

0 comments on commit a5c0f6c

Please sign in to comment.