Skip to content

Strings module

The strings module provides helpers for dealing with strings of text which could contain inline HTML. These are used for both source strings and translated strings.

StringValue

A fragment of HTML that only contains inline tags with all attributes stripped out.

Attributes:

Name Type Description
data str

The HTML fragment.

Source code in wagtail_localize/strings.py
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
class StringValue:
    """
    A fragment of HTML that only contains inline tags with all attributes stripped out.

    Instances compare equal, and hash alike, when their ``data`` is identical.

    Attributes:
        data (str): The HTML fragment.
    """

    def __init__(self, data):
        self.data = data

    @classmethod
    def from_plaintext(cls, text):
        """
        Initialises a StringValue from a plain text string.

        All HTML special characters are escaped and newlines become <br> tags.

        Args:
            text (str): The plain text to turn into a StringValue.

        Returns:
            StringValue: The initialised StringValue.
        """
        # Escape every line on its own and place a <br> tag wherever there was a
        # newline. Joining the lines leaves no trailing <br> to clean up afterwards.
        markup = "<br>".join(escape(line) for line in text.split("\n"))

        # Pass the result through beautiful soup to normalize the HTML
        return cls(str(BeautifulSoup(markup, "html.parser")))

    @classmethod
    def from_source_html(cls, html):
        """
        Initialises a StringValue from a HTML string.

        Source HTML is the HTML you get in Wagtail field data. This contains HTML attributes that
        must first be stripped out before the string can be translated.

        Args:
            html (str): The HTML to turn into a StringValue.

        Returns:
            tuple[StringValue, dict]: The initialised StringValue and a dictionary of extracted HTML
                attributes, keyed by the generated element ID (tag name plus a per-tag counter).

        Raises:
            ValueError: If ``validate_element`` rejects the stripped HTML.
        """
        soup = BeautifulSoup(html, "html.parser")
        attrs = {}
        counter = Counter()

        # Visit every tag in document order, moving its attributes (eg, href from <a> tags)
        # into ``attrs`` and leaving only a generated ID behind in the translatable HTML.
        for element in soup.descendants:
            if isinstance(element, NavigableString) or not element.attrs:
                continue

            counter[element.name] += 1
            element_id = f"{element.name}{counter[element.name]}"
            attrs[element_id] = element.attrs
            element.attrs = {"id": element_id}

        validate_element(soup)

        return cls(str(soup)), attrs

    @classmethod
    def from_translated_html(cls, html):
        """
        Initialises a StringValue from a HTML string.

        HTML attributes are stripped out before translation, so translated HTML does not
        need to have them stripped out.

        Args:
            html (str): The HTML to turn into a StringValue.

        Returns:
            StringValue: The initialised StringValue.

        Raises:
            ValueError: If ``validate_element`` rejects the HTML.
        """
        soup = BeautifulSoup(html, "html.parser")
        validate_element(soup)
        return cls(str(soup))

    def render_text(self):
        """
        Returns a plain text representation of the string.

        Note: If the string was initialised from HTML, all HTML tags will be stripped out.
        <br> tags are rendered as newlines.

        Returns:
            str: The plain text representation of the string.
        """

        def iter_text(parent):
            # Yields the text nodes below ``parent`` in document order
            for node in parent.children:
                if isinstance(node, NavigableString):
                    yield node
                elif node.name == "br":
                    yield "\n"
                else:
                    yield from iter_text(node)

        return "".join(iter_text(BeautifulSoup(self.data, "html.parser")))

    def render_soup(self, attrs):
        """
        Returns a BeautifulSoup instance containing the string.

        This is equivalent to: ``BeautifulSoup(string.render_html(attrs), "html.parser")``

        The .render_html() method calls this internally so it would be more performant to call this directly if a
        BeautifulSoup object is what you need.

        Args:
            attrs (dict): Maps element IDs found in the string to the dict of HTML attributes
                to restore on that element.

        Returns:
            BeautifulSoup: A BeautifulSoup object representing the HTML of the string.

        Raises:
            KeyError: If an element in the string has an ID that is not a key of ``attrs``.
        """
        soup = BeautifulSoup(self.data, "html.parser")

        for element in soup.descendants:
            if isinstance(element, NavigableString):
                continue

            # Restore HTML attributes. The dict is copied so that later changes to the
            # returned soup never leak back into the caller's ``attrs`` (or into another
            # element that happens to carry the same ID).
            if "id" in element.attrs:
                element.attrs = dict(attrs[element.attrs["id"]])

        return soup

    def render_html(self, attrs):
        """
        Returns a HTML representation of the string.

        Note: If the string was initialised from plain text, all special characters will be escaped.

        Args:
            attrs (dict): Maps element IDs found in the string to the dict of HTML attributes
                to restore on that element.

        Returns:
            str: The HTML representation of the string.
        """
        return str(self.render_soup(attrs))

    def get_translatable_html(self):
        """
        Returns a HTML string without restoring any HTML attributes.

        Note: If the string was initialised from plain text, all special characters will be escaped.

        Returns:
            str: The HTML representation of the string without HTML attributes
        """
        return self.data

    def __eq__(self, other):
        # Defer to the other operand for foreign types instead of answering for them
        if not isinstance(other, StringValue):
            return NotImplemented

        return other.data == self.data

    def __repr__(self):
        return f"<StringValue '{self.data}'>"

    def __hash__(self):
        # Kept consistent with __eq__: equal data means equal hash
        return hash(self.data)

from_plaintext(text) classmethod

Initialises a StringValue from a plain text string.

Parameters:

Name Type Description Default
text str

The plain text to turn into a StringValue.

required

Returns:

Name Type Description
StringValue

The initialised StringValue.

Source code in wagtail_localize/strings.py
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
@classmethod
def from_plaintext(cls, text):
    """
    Builds a StringValue out of a plain text string.

    HTML special characters are escaped and newlines are turned into <br> tags.

    Args:
        text (str): The plain text to turn into a StringValue.

    Returns:
        StringValue: The initialised StringValue.
    """
    # Escape each line separately; the <br> separators stand in for the newlines.
    # Empty lines escape to an empty string, so they contribute only their <br>.
    escaped_lines = (escape(line) for line in text.split("\n"))
    markup = "<br>".join(escaped_lines)

    # Round-trip through beautiful soup to normalize the HTML
    normalized = BeautifulSoup(markup, "html.parser")
    return cls(str(normalized))

from_source_html(html) classmethod

Initialises a StringValue from a HTML string.

Source HTML is the HTML you get in Wagtail field data. This contains HTML attributes that must first be stripped out before the string can be translated.

Parameters:

Name Type Description Default
html str

The HTML to turn into a StringValue.

required

Returns:

Type Description

tuple[StringValue, dict]: The initialised StringValue and a dictionary of extracted HTML attributes.

Source code in wagtail_localize/strings.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
@classmethod
def from_source_html(cls, html):
    """
    Builds a StringValue out of a source HTML string.

    Source HTML is the HTML you get in Wagtail field data. Its HTML attributes are
    moved out of the markup, since they must not be part of the translatable string.

    Args:
        html (str): The HTML to turn into a StringValue.

    Returns:
        tuple[StringValue, dict]: The initialised StringValue and a dictionary of extracted HTML attributes.
    """
    parsed = BeautifulSoup(html, "html.parser")
    extracted = {}
    seen_per_tag = Counter()

    # Go through all tags in document order. Each tag that carries attributes
    # (eg, href on <a> tags) gets them swapped for a generated ID such as "a1".
    for node in parsed.descendants:
        if isinstance(node, NavigableString):
            continue

        if not node.attrs:
            continue

        seen_per_tag[node.name] += 1
        generated_id = node.name + str(seen_per_tag[node.name])
        extracted[generated_id] = node.attrs
        node.attrs = {"id": generated_id}

    validate_element(parsed)

    return cls(str(parsed)), extracted

from_translated_html(html) classmethod

Initialises a StringValue from a HTML string.

HTML attributes are stripped out before translation, so translated HTML does not need to have them stripped out.

Parameters:

Name Type Description Default
html str

The HTML to turn into a StringValue.

required

Returns:

Name Type Description
StringValue

The initialised StringValue.

Source code in wagtail_localize/strings.py
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
@classmethod
def from_translated_html(cls, html):
    """
    Builds a StringValue out of a translated HTML string.

    Translated HTML is based on a string whose HTML attributes were already
    stripped out, so no attribute extraction happens here. The HTML is only
    parsed, validated and normalized.

    Args:
        html (str): The HTML to turn into a StringValue.

    Returns:
        StringValue: The initialised StringValue.
    """
    parsed = BeautifulSoup(html, "html.parser")

    # Reject anything that is not allowed inside a string before accepting it
    validate_element(parsed)

    normalized = str(parsed)
    return cls(normalized)

get_translatable_html()

Returns a HTML string without restoring any HTML attributes.

Note: If the string was initialised from plain text, all special characters will be escaped.

Returns:

Name Type Description
str

The HTML representation of the string without HTML attributes

Source code in wagtail_localize/strings.py
236
237
238
239
240
241
242
243
244
245
def get_translatable_html(self):
    """
    Gives back the stored HTML as-is, with no HTML attributes restored.

    Note: If the string was initialised from plain text, all special characters will be escaped.

    Returns:
        str: The HTML representation of the string without HTML attributes
    """
    translatable = self.data
    return translatable

render_html(attrs)

Returns a HTML representation of the string.

Note: If the string was initialised from plain text, all special characters will be escaped.

Returns:

Name Type Description
str

The HTML representation of the string.

Source code in wagtail_localize/strings.py
225
226
227
228
229
230
231
232
233
234
def render_html(self, attrs):
    """
    Renders the string as HTML, with its HTML attributes restored.

    Note: If the string was initialised from plain text, all special characters will be escaped.

    Args:
        attrs (dict): The HTML attributes to restore, passed through to ``render_soup``.

    Returns:
        str: The HTML representation of the string.
    """
    rendered = self.render_soup(attrs)
    return str(rendered)

render_soup(attrs)

Returns a BeautifulSoup instance containing the string.

This is equivalent to: BeautifulSoup(string.render_html(attrs), "html.parser")

The .render_html() method calls this internally so it would be more performant to call this directly if a BeautifulSoup object is what you need.

Returns:

Name Type Description
BeautifulSoup

A BeautifulSoup object representing the HTML of the string.

Source code in wagtail_localize/strings.py
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
def render_soup(self, attrs):
    """
    Returns a BeautifulSoup instance containing the string.

    This is equivalent to: ``BeautifulSoup(string.render_html(attrs), "html.parser")``

    The .render_html() method calls this internally so it would be more performant to call this directly if a
    BeautifulSoup object is what you need.

    Args:
        attrs (dict): Maps element IDs found in the string to the dict of HTML attributes
            to restore on that element.

    Returns:
        BeautifulSoup: A BeautifulSoup object representing the HTML of the string.

    Raises:
        KeyError: If an element in the string has an ID that is not a key of ``attrs``.
    """
    soup = BeautifulSoup(self.data, "html.parser")

    def walk(soup):
        for element in soup.children:
            if isinstance(element, NavigableString):
                continue

            # Restore HTML attributes. The dict is copied so that later changes to the
            # returned soup never leak back into the caller's ``attrs`` (or into another
            # element that happens to carry the same ID).
            if "id" in element.attrs:
                element.attrs = dict(attrs[element.attrs["id"]])

            # Traverse into element children
            walk(element)

    walk(soup)

    return soup

render_text()

Returns a plain text representation of the string.

Note: If the string was initialised from HTML, all HTML tags will be stripped out.

Returns:

Name Type Description
str

The plain text representation of the string.

Source code in wagtail_localize/strings.py
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
def render_text(self):
    """
    Renders the string as plain text.

    Note: If the string was initialised from HTML, all HTML tags will be stripped out.
    <br> tags are rendered as newlines.

    Returns:
        str: The plain text representation of the string.
    """

    def iter_text(parent):
        # Yields the pieces of text below ``parent`` in document order
        for node in parent.children:
            if isinstance(node, NavigableString):
                yield node
            elif node.name == "br":
                yield "\n"
            else:
                yield from iter_text(node)

    parsed = BeautifulSoup(self.data, "html.parser")
    return "".join(iter_text(parsed))

extract_ids(template)

Extract link ids from one template string and return it in a set.

Source code in wagtail_localize/strings.py
486
487
488
489
490
491
492
493
494
495
496
497
498
def extract_ids(template):
    """Collect the ``id`` attribute of every <a> tag in a template string into a set."""
    parsed = BeautifulSoup(template, "html.parser")

    # Only <a> tags that actually carry an id contribute to the result
    return {
        node.attrs["id"]
        for node in parsed.descendants
        if isinstance(node, Tag) and node.name == "a" and "id" in node.attrs
    }

extract_strings(html)

This function extracts translatable strings from an HTML fragment.

Inline elements and visible text are extracted together.

HTML attributes found on the extracted inline tags (for example the href of a link) are stripped out of each string and returned alongside it.

For example:

<h1>Foo</h1>
<p>
    Bar
    <ul>
        <li><b>Baz</b></li>
    </ul>
    <a href="http://example.com">A link</a>
</p>

Will produce the following two outputs (as a 2-tuple)

<h1><text position="0"></h1>
<p>
    <text position="1">
    <ul>
        <li><text position="2"></li>
    </ul>
</p>

[
    "Foo",
    "Bar",
    "<b>Baz</b>",
    "<a href="http://example.com">A link</a>"
]

Parameters:

Name Type Description Default
html str

The HTML to extract strings from.

required

Returns:

Type Description

tuple[str, list[tuple[StringValue, dict]]]: Returns a template string, and a list of 2-tuples, each containing a StringValue and a dict of HTML attributes.

Source code in wagtail_localize/strings.py
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
def extract_strings(html):
    """
    This function extracts translatable strings from an HTML fragment.

    Inline elements and visible text are extracted together.

    HTML attributes found on the extracted inline tags (such as the href of a link) are
    stripped out of each string and returned alongside it.

    For example:

        <h1>Foo</h1>
        <p>
            Bar
            <ul>
                <li><b>Baz</b></li>
            </ul>
            <a href="http://example.com">A link</a>
        </p>

    Will produce approximately the following two outputs (as a 2-tuple)

        <h1><text position="0"></text></h1>
        <p>
            <text position="1"></text>
            <ul>
                <li><b><text position="2"></text></b></li>
            </ul>
            <text position="3"></text>
        </p>

        [
            (StringValue("Foo"), {}),
            (StringValue("Bar"), {}),
            (StringValue("Baz"), {}),
            (StringValue('<a id="a1">A link</a>'), {"a1": {"href": "http://example.com"}}),
        ]

    Args:
        html (str): The HTML to extract strings from. ``None`` is treated as an empty string.

    Returns:
        tuple[str, list[tuple[StringValue, dict]]]: Returns a template string, and a list of 2-tuples,
            each containing a StringValue and a dict of HTML attributes.

    Raises:
        ValueError: If an extracted string is rejected by ``StringValue.from_source_html``.
    """
    if html is None:
        html = ""

    soup = BeautifulSoup(html, "html.parser")

    def wrap(elements):
        """
        Replaces the given elements with a single <text> tag that carries their
        markup in its ``value`` attribute.

        The elements must be contiguous siblings or this might screw up the tree.
        """
        elements = list(elements)

        # Skip if there are no tags to wrap
        # We can get here after filters below have been applied
        if not elements:
            return

        # If there is a single element and that is an inline tag, wrap just the contents.
        # We only care about inline tags that wrap only part of a segment
        if (
            len(elements) == 1
            and not isinstance(elements[0], NavigableString)
            and elements[0].name != "a"  # keep href translatable
            and elements[0].name in INLINE_TAGS
        ):
            wrap(elements[0].children)
            return

        def ignore_if_at_end(element):
            """
            Returns True if the given element should be ignored if it is at one of the ends
            """
            if isinstance(element, NavigableString):
                return False

            # Ignore if there are no text nodes
            # This will exclude both <br> tags and empty inline tags
            if not any(
                isinstance(desc, NavigableString) for desc in element.descendants
            ):
                return True

            return False

        # Trim ignorable elements off either end, one at a time, then start over
        if ignore_if_at_end(elements[0]):
            wrap(elements[1:])
            return

        if ignore_if_at_end(elements[-1]):
            wrap(elements[:-1])
            return

        value = "".join(
            element.output_ready()
            if isinstance(element, NavigableString)
            else str(element)
            for element in elements
        )

        # Whitespace-only runs are left in the template untouched
        if value and not value.isspace():
            # Create <text> tag
            elements[0].insert_before(soup.new_tag("text", value=value))

            # Remove elements
            for element in elements:
                element.replace_with("")

    def walk(element):
        """
        Walks the tree in depth first search post-order.

        When it encounters a run of elements that could be extracted, it replaces
        the run with a <text> tag. These are extracted in the next stage (because we
        want to preserve order of occurrence).

        For example:

        <p>
            Foo
            <ul>
              <li>Bar</li>
            </ul>
            Baz
        </p>

        Is conceptually transformed to:

        <p>
            <text>Foo</text>
            <ul>
              <li><text>Bar</text></li>
            </ul>
            <text>Baz</text>
        </p>

        Returns a 2-tuple of booleans: whether anything was wrapped inside the
        element, and whether the element is a block (i.e. not an inline tag).
        """
        if isinstance(element, NavigableString):
            return False, False

        has_block = False
        has_wrap = False
        # Contiguous inline children that have not been wrapped yet
        buffer = []

        for child in element.children:
            child_has_wrap, is_block = walk(child)

            if child_has_wrap:
                has_wrap = True

            if is_block:
                has_block = True

                # A block child ends the current run of inline children
                if buffer:
                    wrap(buffer)
                    buffer = []
                    has_wrap = True

            else:
                if not child_has_wrap:
                    buffer.append(child)

        # Inline content trailing a block child has to be wrapped on its own
        if buffer and has_block:
            wrap(buffer)
            buffer = []
            has_wrap = True

        if element.name not in INLINE_TAGS:
            if buffer:
                wrap(buffer)
                has_wrap = True

            return has_wrap, True

        return has_wrap, False

    walk(soup)

    # Now extract strings from the <text> tags
    strings = []
    position = 0
    for element in soup.descendants:
        if element.name == "text":
            text = element.attrs.pop("value")

            # Strip leading and trailing whitespace. We keep the values and reinsert them
            # into the template
            # This is probably not necessary, but just to be on the safe side
            text, prefix = lstrip_keep(text)
            text, suffix = rstrip_keep(text)

            element.attrs["position"] = position
            position += 1
            string_val, attrs = StringValue.from_source_html(text)
            strings.append((string_val, attrs))

            if prefix:
                element.insert_before(prefix)

            if suffix:
                element.insert_after(suffix)

    return str(soup), strings

lstrip_keep(text)

Like lstrip, but also returns the whitespace that was stripped off

Source code in wagtail_localize/strings.py
12
13
14
15
16
17
18
19
def lstrip_keep(text):
    """
    Strips leading whitespace like lstrip, returning the stripped text together
    with the whitespace that was removed.
    """
    stripped = text.lstrip()
    # Whatever lstrip() removed is exactly the leading slice of that length
    removed = len(text) - len(stripped)
    return stripped, text[:removed]

restore_strings(template, strings)

Inserts a list of strings into the template.

This reverses the extract_strings function.

Parameters:

Name Type Description Default
template str

The HTML template.

required
strings list[tuple[StringValue, dict]]

A list of 2-tuples containing a StringValue and HTML attributes dict for each string to reinsert into the template.

required

Returns:

Name Type Description
str

A HTML blob with the strings inserted into the template.

Source code in wagtail_localize/strings.py
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
def restore_strings(template, strings):
    """
    Inserts a list of strings into the template.

    This reverses the `extract_strings` function.

    Args:
        template (str): The HTML template.
        strings (list[tuple[StringValue, dict]]): A list of 2-tuples containing a StringValue and HTML attributes dict
            for each string to reinsert into the template.

    Returns:
        str: A HTML blob with the strings inserted into the template.

    Raises:
        IndexError: If a <text> tag refers to a position that is not in ``strings``.
        TypeError: If a <text> tag has no ``position`` attribute.
    """
    soup = BeautifulSoup(template, "html.parser")

    # find_all/replace_with are the current names of the deprecated findAll/replaceWith aliases
    for text_element in soup.find_all("text"):
        # Each <text> tag names the index of the string that belongs in its place
        string, attrs = strings[int(text_element.get("position"))]
        text_element.replace_with(string.render_soup(attrs))

    return str(soup)

rstrip_keep(text)

Like rstrip, but also returns the whitespace that was stripped off

Source code in wagtail_localize/strings.py
22
23
24
25
26
27
28
29
30
31
32
def rstrip_keep(text):
    """
    Strips trailing whitespace like rstrip, returning the stripped text together
    with the whitespace that was removed.
    """
    stripped = text.rstrip()
    # Everything past the stripped text is the removed whitespace; this slice is
    # simply empty when nothing was stripped
    return stripped, text[len(stripped) :]

validate_element(element)

Checks the given BeautifulSoup element for anything that we disallow from strings.

Source code in wagtail_localize/strings.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
def validate_element(element):
    """
    Recursively checks a BeautifulSoup element for anything that is disallowed in strings.

    Text nodes are always accepted. Tags (other than the document root) must be
    inline tags and must not carry attributes, except for ``id`` on <a> tags.

    Raises:
        ValueError: If a disallowed tag or attribute is found.
    """
    if isinstance(element, NavigableString):
        return

    # The document root itself is not a real tag, so only its children are checked
    if isinstance(element, Tag) and element.name != "[document]":
        tag_name = element.name

        # Block tags are not allowed in strings
        if tag_name not in INLINE_TAGS:
            raise ValueError(
                _(
                    "<{}> tag is not allowed. Strings can only contain standard HTML inline tags (such as <b>, <a>)"
                ).format(tag_name)
            )

        # Elements can't have attributes, except for 'id' on <a> tags
        permitted = {"id"} if tag_name == "a" else set()
        if set(element.attrs) - permitted:
            raise ValueError(
                _(
                    "Strings cannot have any HTML tags with attributes (except for 'id' in <a> tags)"
                )
            )

    # Traverse children
    for child_element in element.children:
        validate_element(child_element)

validate_translation_links(translation_of, data)

Check that the link ids in a translation are present in its source.

Source code in wagtail_localize/strings.py
501
502
503
504
505
506
507
def validate_translation_links(translation_of, data):
    """
    Check that every link id used in a translation is also present in its source.

    Raises:
        ValueError: If ``data`` contains an <a> tag id that ``translation_of`` does not.
    """
    source_ids = extract_ids(translation_of)
    translated_ids = extract_ids(data)

    unknown_ids = translated_ids - source_ids
    if not unknown_ids:
        return

    # Sorted so that the message is stable regardless of set ordering
    listing = ", ".join(sorted(unknown_ids))
    raise ValueError(_("Unrecognised id found in an <a> tag: {}").format(listing))