diff --git a/.editorconfig b/.editorconfig
index f9b5a3053..f456380ee 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -2,12 +2,19 @@
 root = true
 
 [*]
+# standardize on no BOM (except resx, see below)
+charset = utf-8
 indent_style = tab
 indent_size = 4
 guidelines = 110
 tab_width = 4
 end_of_line = crlf
 
+# .net tooling writes the BOM to resx files on running dotnet build ILSpy.sln regardless,
+# so we should direct text editors to NOT change the file
+[*.resx]
+charset = utf-8-bom
+
 [*.il]
 indent_style = space
 indent_size = 2
diff --git a/BuildTools/bom-classify-encodings.ps1 b/BuildTools/bom-classify-encodings.ps1
new file mode 100644
index 000000000..269d25894
--- /dev/null
+++ b/BuildTools/bom-classify-encodings.ps1
@@ -0,0 +1,178 @@
+<#
+.SYNOPSIS
+Classify text files by encoding under the current subtree, respecting .gitignore.
+
+.DESCRIPTION
+Enumerates tracked files and untracked-but-not-ignored files (via Git) beneath
+PWD. Skips likely-binary files (NUL probe). Classifies remaining files as:
+  - 'utf8'          : valid UTF-8 (no BOM) or empty file
+  - 'utf8-with-bom' : starts with UTF-8 BOM (EF BB BF)
+  - 'other'         : text but not valid UTF-8 (e.g., UTF-16/ANSI)
+
+Outputs:
+  1) Relative paths of files classified as 'other'
+  2) A table by extension: UTF8 / UTF8-with-BOM / Other / Total
+
+Notes:
+  - Read-only: this script makes no changes.
+  - Requires Git and must be run inside a Git work tree.
+#>
+
+[CmdletBinding()]
+param()
+
+Set-StrictMode -Version Latest
+$ErrorActionPreference = 'Stop'
+
+# --- Git enumeration ---------------------------------------------------------
+function Assert-InGitWorkTree {
+    # Throws if not inside a Git work tree.
+    # Outside a repository git prints nothing to stdout, so $inside is $null;
+    # coerce it to a string before trimming instead of calling a method on $null.
+    $inside = & git rev-parse --is-inside-work-tree 2>$null
+    if ($LASTEXITCODE -ne 0 -or "$inside".Trim() -ne 'true') {
+        throw 'Not in a Git work tree.'
+    }
+}
+
+function Get-GitFilesUnderPwd {
+    <#
+    Returns full paths to tracked + untracked-not-ignored files under PWD.
+    #>
+    Assert-InGitWorkTree
+
+    $repoRoot = (& git rev-parse --show-toplevel).Trim()
+
+    # Terminate the prefix with a separator so that a sibling directory sharing
+    # the same leading characters (e.g. 'src2' next to 'src') is not matched.
+    $sep = [IO.Path]::DirectorySeparatorChar
+    $pwdPrefix = (Get-Location).Path.TrimEnd('\', '/') + $sep
+
+    # cached (tracked) + others (untracked not ignored)
+    $nulSeparated = & git -C $repoRoot ls-files -z --cached --others --exclude-standard
+
+    # "$nulSeparated" yields '' when git lists nothing (the raw value is $null then).
+    $relativePaths = "$nulSeparated".Split(
+        [char]0, [System.StringSplitOptions]::RemoveEmptyEntries)
+
+    foreach ($relPath in $relativePaths) {
+        $fullPath = Join-Path $repoRoot $relPath
+
+        # Only include files under the current subtree.
+        if ($fullPath.StartsWith($pwdPrefix,
+                [System.StringComparison]::OrdinalIgnoreCase)) {
+            if (Test-Path -LiteralPath $fullPath -PathType Leaf) { $fullPath }
+        }
+    }
+}
+
+# --- Probes ------------------------------------------------------------------
+function Test-ProbablyBinary {
+    # Heuristic: treat as binary if the first 8 KiB contains any NUL byte.
+    param([Parameter(Mandatory)][string]$Path)
+
+    try {
+        $stream = [System.IO.File]::Open($Path,'Open','Read','ReadWrite')
+        try {
+            $len = [int][Math]::Min(8192,$stream.Length)
+            if ($len -le 0) { return $false }
+
+            $buffer = [byte[]]::new($len)
+            [void]$stream.Read($buffer,0,$len)
+            return ($buffer -contains 0)
+        }
+        finally { $stream.Dispose() }
+    }
+    catch { return $false }
+}
+
+function Get-TextEncodingCategory {
+    # Returns 'utf8', 'utf8-with-bom', 'other', or $null for likely-binary.
+    param([Parameter(Mandatory)][string]$Path)
+
+    $stream = [System.IO.File]::Open($Path,'Open','Read','ReadWrite')
+    try {
+        $fileLength = $stream.Length
+        if ($fileLength -eq 0) { return 'utf8' }
+
+        # BOM check (EF BB BF)
+        $header = [byte[]]::new([Math]::Min(3,$fileLength))
+        [void]$stream.Read($header,0,$header.Length)
+        if ($header.Length -ge 3 -and
+            $header[0] -eq 0xEF -and $header[1] -eq 0xBB -and $header[2] -eq 0xBF) {
+            return 'utf8-with-bom'
+        }
+
+        # Quick binary probe before expensive decoding
+        $stream.Position = 0
+        $sampleLen = [int][Math]::Min(8192,$fileLength)
+        $sample = [byte[]]::new($sampleLen)
+        [void]$stream.Read($sample,0,$sampleLen)
+        if ($sample -contains 0) { return $null }
+    }
+    finally { $stream.Dispose() }
+
+    # Validate UTF-8 by decoding with throw-on-invalid option (no BOM).
+    try {
+        $bytes = [System.IO.File]::ReadAllBytes($Path)
+        $utf8 = [System.Text.UTF8Encoding]::new($false,$true)
+        [void]$utf8.GetString($bytes)
+        return 'utf8'
+    }
+    catch { return 'other' }
+}
+
+# --- Main --------------------------------------------------------------------
+$otherFiles = @()
+$byExtension = @{}
+
+$allFiles = Get-GitFilesUnderPwd
+
+foreach ($fullPath in $allFiles) {
+    # Avoid decoding likely-binary files.
+    if (Test-ProbablyBinary $fullPath) { continue }
+
+    $category = Get-TextEncodingCategory $fullPath
+    if (-not $category) { continue }
+
+    $ext = [IO.Path]::GetExtension($fullPath).ToLowerInvariant()
+    if (-not $byExtension.ContainsKey($ext)) {
+        $byExtension[$ext] = @{ 'utf8' = 0; 'utf8-with-bom' = 0; 'other' = 0 }
+    }
+
+    $byExtension[$ext][$category]++
+
+    if ($category -eq 'other') {
+        $otherFiles += (Resolve-Path -LiteralPath $fullPath -Relative)
+    }
+}
+
+# 1) Files in 'other'
+if ($otherFiles.Count -gt 0) {
+    'Files classified as ''other'':'
+    $otherFiles | Sort-Object | ForEach-Object { " $_" }
+    ''
+}
+
+# 2) Table by extension
+$rows = foreach ($kv in $byExtension.GetEnumerator()) {
+    $ext = if ($kv.Key) { $kv.Key } else { '[noext]' }
+    $u = [int]$kv.Value['utf8']
+    $b = [int]$kv.Value['utf8-with-bom']
+    $o = [int]$kv.Value['other']
+
+    [PSCustomObject]@{
+        Extension = $ext
+        UTF8 = $u
+        'UTF8-with-BOM' = $b
+        Other = $o
+        Total = $u + $b + $o
+    }
+}
+
+$rows |
+    Sort-Object -Property (
+        @{Expression='Total';Descending=$true},
+        @{Expression='Extension';Descending=$false}
+    ) |
+    Format-Table -AutoSize
diff --git a/BuildTools/bom-strip.ps1 b/BuildTools/bom-strip.ps1
new file mode 100644
index 000000000..7d89d98ac
--- /dev/null
+++ b/BuildTools/bom-strip.ps1
@@ -0,0 +1,220 @@
+<#
+.SYNOPSIS
+Strip UTF-8 BOM from selected text files under the current subtree, respecting
+.gitignore.
+
+.DESCRIPTION
+Enumerates tracked and untracked-but-not-ignored files under the current
+directory (via Git), filters to texty extensions and dotfiles, skips likely
+binary files (NUL probe), and removes a leading UTF-8 BOM (EF BB BF) in place.
+
+Refuses to run if there are uncommitted changes as a safeguard. Use -Force to override.
+Supports -WhatIf/-Confirm via ShouldProcess.
+#>
+
+[CmdletBinding(SupportsShouldProcess = $true, ConfirmImpact = 'Low')]
+param(
+    [switch]$Force
+)
+
+Set-StrictMode -Version Latest
+$ErrorActionPreference = 'Stop'
+
+# --- File sets (ILSpy) ------------------------------------------------------
+$Dotfiles = @(
+    '.gitignore', '.editorconfig', '.gitattributes', '.gitmodules',
+    '.tgitconfig', '.vsconfig'
+)
+
+$AllowedExts = @(
+    '.bat','.config','.cs','.csproj','.css','.filelist','.fs','.html','.il',
+    '.ipynb','.js','.json','.less','.manifest','.md','.projitems','.props',
+    '.ps1','.psd1','.ruleset','.shproj','.sln','.slnf','.svg','.template',
+    '.tt', '.txt','.vb','.vsct','.vsixlangpack','.wxl','.xaml','.xml','.xshd','.yml'
+)
+
+$IncludeNoExt = $true # include names like LICENSE
+
+# --- Git checks / enumeration -----------------------------------------------
+function Assert-InGitWorkTree {
+    # Throws if not inside a Git work tree. Outside a repository git prints
+    # nothing to stdout, so coerce the ($null) result to a string before trimming.
+    $inside = & git rev-parse --is-inside-work-tree 2>$null
+    if ($LASTEXITCODE -ne 0 -or "$inside".Trim() -ne 'true') {
+        throw 'Not in a Git work tree.'
+    }
+}
+
+function Assert-CleanWorkingTree {
+    # Safeguard: throws when 'git status' reports any change, unless -Force was given.
+    if ($Force) { return }
+
+    $status = & git status --porcelain -z
+    if ($LASTEXITCODE -ne 0) { throw 'git status failed.' }
+
+    if (-not [string]::IsNullOrEmpty($status)) {
+        throw 'Working tree not clean. Commit/stash changes or use -Force.'
+    }
+}
+
+function Get-GitFilesUnderPwd {
+    # Returns full paths to tracked + untracked-not-ignored files under PWD.
+    Assert-InGitWorkTree
+
+    $repoRoot = (& git rev-parse --show-toplevel).Trim()
+
+    # Terminate the prefix with a separator so that a sibling directory sharing
+    # the same leading characters (e.g. 'src2' next to 'src') is not matched.
+    $sep = [IO.Path]::DirectorySeparatorChar
+    $pwdPrefix = (Get-Location).Path.TrimEnd('\', '/') + $sep
+
+    $tracked = & git -C $repoRoot ls-files -z
+    $others = & git -C $repoRoot ls-files --others --exclude-standard -z
+
+    $allRel = ("$tracked$others").Split(
+        [char]0, [System.StringSplitOptions]::RemoveEmptyEntries)
+
+    foreach ($relPath in $allRel) {
+        $fullPath = Join-Path $repoRoot $relPath
+        if ($fullPath.StartsWith($pwdPrefix,
+                [System.StringComparison]::OrdinalIgnoreCase)) {
+            if (Test-Path -LiteralPath $fullPath -PathType Leaf) {
+                $fullPath
+            }
+        }
+    }
+}
+
+# --- Probes -----------------------------------------------------------------
+function Test-HasUtf8Bom {
+    # True when the file starts with EF BB BF; files that cannot be opened yield $false.
+    param([Parameter(Mandatory)][string]$Path)
+
+    try {
+        $stream = [System.IO.File]::Open($Path,'Open','Read','ReadWrite')
+        try {
+            if ($stream.Length -lt 3) { return $false }
+
+            $header = [byte[]]::new(3)
+            [void]$stream.Read($header,0,3)
+
+            return ($header[0] -eq 0xEF -and
+                    $header[1] -eq 0xBB -and
+                    $header[2] -eq 0xBF)
+        }
+        finally {
+            $stream.Dispose()
+        }
+    }
+    catch { return $false }
+}
+
+function Test-ProbablyBinary {
+    # Binary if the first 8 KiB contains any NUL byte.
+    param([Parameter(Mandatory)][string]$Path)
+
+    try {
+        $stream = [System.IO.File]::Open($Path,'Open','Read','ReadWrite')
+        try {
+            $len = [int][Math]::Min(8192,$stream.Length)
+            if ($len -le 0) { return $false }
+
+            $buffer = [byte[]]::new($len)
+            [void]$stream.Read($buffer,0,$len)
+
+            return ($buffer -contains 0)
+        }
+        finally {
+            $stream.Dispose()
+        }
+    }
+    catch { return $false }
+}
+
+# --- Mutation ---------------------------------------------------------------
+function Remove-Utf8BomInPlace {
+    # Write the existing buffer from offset 3, no extra full-size allocation.
+    # Returns $true when a BOM was removed, $false when the file does not start with one.
+    param([Parameter(Mandatory)][string]$Path)
+
+    $bytes = [System.IO.File]::ReadAllBytes($Path)
+    if ($bytes.Length -lt 3) { return $false }
+
+    if ($bytes[0] -ne 0xEF -or
+        $bytes[1] -ne 0xBB -or
+        $bytes[2] -ne 0xBF) {
+        return $false
+    }
+
+    $stream = [System.IO.File]::Open($Path,'Create','Write','ReadWrite')
+    try {
+        $stream.Write($bytes, 3, $bytes.Length - 3)
+        $stream.SetLength($bytes.Length - 3)
+    }
+    finally {
+        $stream.Dispose()
+    }
+
+    return $true
+}
+
+# --- Main -------------------------------------------------------------------
+Assert-InGitWorkTree
+Assert-CleanWorkingTree
+
+$allFiles = Get-GitFilesUnderPwd
+
+# Select candidate files by name, then keep those that start with a BOM and are
+# not binary. Pipes trail each line: a leading '|' only parses on PowerShell 7+.
+$targets = $allFiles | % {
+    $fileName = [IO.Path]::GetFileName($_)
+    $ext = [IO.Path]::GetExtension($fileName)
+
+    $isDot = $Dotfiles -contains $fileName
+    $isNoExt = -not $fileName.Contains('.')
+
+    if ($isDot -or ($AllowedExts -contains $ext) -or
+        ($IncludeNoExt -and $isNoExt -and -not $isDot)) {
+        $_
+    }
+} |
+    ? { Test-HasUtf8Bom $_ } |
+    ? { -not (Test-ProbablyBinary $_) }
+
+$changed = 0
+$byExtension = @{}
+$dotfileChanges = 0
+
+$targets | % {
+    $relative = Resolve-Path -LiteralPath $_ -Relative
+
+    if ($PSCmdlet.ShouldProcess($relative,'Strip UTF-8 BOM')) {
+        if (Remove-Utf8BomInPlace -Path $_) {
+            $changed++
+
+            $fileName = [IO.Path]::GetFileName($_)
+            if ($Dotfiles -contains $fileName) { $dotfileChanges++ }
+
+            $ext = [IO.Path]::GetExtension($fileName)
+            if (-not $byExtension.ContainsKey($ext)) { $byExtension[$ext] = 0 }
+            $byExtension[$ext]++
+
+            "stripped BOM: $relative"
+        }
+    }
+}
+
+"Done. Stripped BOM from $changed file(s)."
+
+if ($byExtension.Keys.Count -gt 0) {
+    ""
+    "By extension:"
+    $byExtension.GetEnumerator() | Sort-Object Name | % {
+        $key = if ([string]::IsNullOrEmpty($_.Name)) { '[noext]' } else { $_.Name }
+        " {0}: {1}" -f $key, $_.Value
+    }
+}
+
+if ($dotfileChanges -gt 0) {
+    " [dotfiles]: $dotfileChanges"
+}
diff --git a/BuildTools/update-assemblyinfo.ps1 b/BuildTools/update-assemblyinfo.ps1
index 143ba2165..2bf3dd731 100644
--- a/BuildTools/update-assemblyinfo.ps1
+++ b/BuildTools/update-assemblyinfo.ps1
@@ -1,4 +1,4 @@
-if (-not ($PSVersionTable.PSCompatibleVersions -contains "5.0")) {
+if (-not ($PSVersionTable.PSCompatibleVersions -contains "5.0")) {
     Write-Error "This script requires at least powershell version 5.0!";
     return 255;
 }
@@ -175,7 +175,10 @@ try {
         $out = $out.Replace('$INSERTBUILDCONFIG$', $buildConfig);
 
         if ((-not (Test-File $file.Output)) -or (((Get-Content $file.Output) -Join [System.Environment]::NewLine) -ne $out)) {
-            $out | Out-File -Encoding utf8 $file.Output;
+            # Resolve against PowerShell's location (as Out-File did); .NET APIs use the process directory.
+            $outPath = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($file.Output);
+            $utf8NoBom = New-Object System.Text.UTF8Encoding($false);
+            [System.IO.File]::WriteAllText($outPath, $out, $utf8NoBom);
         }
     }
 