diff --git a/Emulsion.Telegram/Funogram.fs b/Emulsion.Telegram/Funogram.fs index bfe3325e..00e93111 100644 --- a/Emulsion.Telegram/Funogram.fs +++ b/Emulsion.Telegram/Funogram.fs @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Emulsion contributors +// SPDX-FileCopyrightText: 2025 Emulsion contributors // // SPDX-License-Identifier: MIT @@ -105,16 +105,30 @@ module MessageConverter = pos <- linkEndOffset result.Append(text.Substring(pos, text.Length - pos)).ToString() - let private applyLimits limits text = + let private applyLimits limits (text: string) = let applyMessageLengthLimit (original: {| text: string; wasLimited: bool |}) = match limits.messageLengthLimit with | None -> original | Some limit when original.text.Length <= limit -> original | Some limit -> - let newText = original.text.Substring(0, - Math.Clamp(limit - limits.dataRedactedMessage.Length, - 0, - original.text.Length)) + assert (limit >= limits.dataRedactedMessage.Length) + + let mutable newTextLength = Math.Clamp( + limit - limits.dataRedactedMessage.Length, + 0, + original.text.Length + ) + + // We should never split surrogate pairs present in the initial message. So, if the message ends with a + // high part of such a pair, cut it more, to remove the part of the pair. + // + // Technically, this will also strip a part of an invalid Unicode sequence if the message originally + // contained such an orphan part of the pair without it even being followed by a low surrogate. But we don't + // care. 
+ if newTextLength > 0 && Char.IsHighSurrogate(text[newTextLength - 1]) then + newTextLength <- newTextLength - 1 + + let newText = original.text.Substring(0, newTextLength) {| text = newText; wasLimited = true |} let applyLineLimit (original: {| text: string; wasLimited: bool |}) = diff --git a/Emulsion.Tests/Telegram/FunogramTests.fs b/Emulsion.Tests/Telegram/FunogramTests.fs index 277083bd..7f4636a9 100644 --- a/Emulsion.Tests/Telegram/FunogramTests.fs +++ b/Emulsion.Tests/Telegram/FunogramTests.fs @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Emulsion contributors +// SPDX-FileCopyrightText: 2025 Emulsion contributors // // SPDX-License-Identifier: MIT @@ -736,6 +736,21 @@ module FlattenMessageTests = flattenMessage replyMessage ) + [] + let ``Flattening should not split surrogate pairs``() = + let originalMessage = authoredTelegramMessage "@originalUser" "πŸ™πŸ™πŸ™πŸ™" + let limit = 6 + let replyMessage = authoredTelegramReplyMessage "@replyingUser" "Reply text" originalMessage.main + let flattener = MessageConverter.flatten { + MessageConverter.DefaultQuoteSettings with + limits.messageLengthLimit = Some limit + } + let flattened = flattener replyMessage + Assert.Equal( + Authored { author = "@replyingUser"; text = ">> <@originalUser> πŸ™[…]\n\nReply text" }, + flattened + ) + [] let flattenReplyEventMessage() = let originalMessage = eventTelegramMessage "@originalUser has entered the chat" diff --git a/Emulsion.Tests/Xmpp/SharpXmppHelperTests.fs b/Emulsion.Tests/Xmpp/SharpXmppHelperTests.fs index 9a4400e4..2bebeb48 100644 --- a/Emulsion.Tests/Xmpp/SharpXmppHelperTests.fs +++ b/Emulsion.Tests/Xmpp/SharpXmppHelperTests.fs @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Emulsion contributors +// SPDX-FileCopyrightText: 2025 Emulsion contributors // // SPDX-License-Identifier: MIT @@ -16,6 +16,22 @@ open Emulsion.Xmpp open Emulsion.Xmpp.SharpXmppHelper.Attributes open Emulsion.Xmpp.SharpXmppHelper.Elements +[] +let ``SanitizeXmlText processes emoji 
as-is``(): unit = + Assert.Equal("πŸ™", SharpXmppHelper.SanitizeXmlText "πŸ™") + Assert.Equal("testπŸ™", SharpXmppHelper.SanitizeXmlText "testπŸ™") + +[] +let ``SanitizeXmlText replaces parts of UTF-16 surrogate pair with the replacement char``(): unit = + let octopus = "πŸ™" + Assert.Equal(2, octopus.Length) + let firstHalf = string(octopus[0]) + let secondHalf = string(octopus[1]) + Assert.Equal("πŸ™", firstHalf + secondHalf) + Assert.Equal("οΏ½", SharpXmppHelper.SanitizeXmlText firstHalf) + Assert.Equal("οΏ½", SharpXmppHelper.SanitizeXmlText secondHalf) + Assert.Equal("testοΏ½", SharpXmppHelper.SanitizeXmlText $"test{secondHalf}") + [] let ``Message body has a proper namespace``() = let message = SharpXmppHelper.message "" "cthulhu@test" "text" diff --git a/Emulsion/Xmpp/SharpXmppHelper.fs b/Emulsion/Xmpp/SharpXmppHelper.fs index 7959e961..3370adc5 100644 --- a/Emulsion/Xmpp/SharpXmppHelper.fs +++ b/Emulsion/Xmpp/SharpXmppHelper.fs @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2024 Emulsion contributors +// SPDX-FileCopyrightText: 2025 Emulsion contributors // // SPDX-License-Identifier: MIT @@ -6,8 +6,11 @@ module Emulsion.Xmpp.SharpXmppHelper open System +open System.Buffers +open System.Text open System.Xml.Linq +open Microsoft.FSharp.NativeInterop open SharpXMPP open SharpXMPP.XMPP open SharpXMPP.XMPP.Client.Elements @@ -49,6 +52,28 @@ let private bookmark (roomJid: string) (nickname: string) (password: string opti room.Add(nickElement) room +#nowarn "9" // for NativePtr +let SanitizeXmlText(text: string): string = + let mutable hasError = false + let mutable span = text.AsSpan() + while not hasError && not span.IsEmpty do + let mutable rune = Rune() + let mutable consumed = 0 + if Rune.DecodeFromUtf16(span, &rune, &consumed) = OperationStatus.Done + then span <- span.Slice consumed + else hasError <- true + + if hasError then + let builder = StringBuilder() + for r in text.EnumerateRunes() do + let length = r.Utf16SequenceLength + let buf = 
Span(NativePtr.stackalloc length |> NativePtr.toVoidPtr, length) + r.EncodeToUtf16 buf |> ignore + builder.Append(buf) |> ignore + builder.ToString() + else + text + let joinRoom (client: XmppClient) (roomJid: string) (nickname: string) (password: string option): unit = let room = bookmark roomJid nickname password client.BookmarkManager.Join room @@ -59,7 +84,7 @@ let message (id: string) (toAddr: string) (text: string): XMPPMessage = m.SetAttributeValue(Type, "groupchat") m.SetAttributeValue(To, toAddr) let body = XElement(Body) - body.Value <- text + body.Value <- SanitizeXmlText text m.Add(body) m