htmlsanitizer

package module
v1.1.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 26, 2026 License: MIT Imports: 5 Imported by: 14

README

htmlsanitizer

Go Reference Go codecov

A fast, allowlist-based HTML sanitizer written in Go. Secure-by-default with a built-in allowlist that strips dangerous HTML content.

  • Fast -- O(n) time complexity via an internal Finite State Machine
  • Customizable -- modify the allowlist, add/remove tags, or disable all HTML
  • Zero dependencies

Also available in Rust / npm: htmlsanitizer-rs

Install

go get github.com/sym01/htmlsanitizer

Usage

Basic
sanitizedHTML, err := htmlsanitizer.SanitizeString(rawHTML)
Disable the id attribute globally
s := htmlsanitizer.NewHTMLSanitizer()
s.GlobalAttr = []string{"class"}

sanitizedHTML, err := s.SanitizeString(rawHTML)
Add or remove tags
s := htmlsanitizer.NewHTMLSanitizer()
// remove <a> tag
s.RemoveTag("a")

// add a custom tag
s.AllowList.Tags = append(s.AllowList.Tags, &htmlsanitizer.Tag{
    Name: "my-tag",
    Attr: []string{"my-attr"},
})

sanitizedHTML, err := s.SanitizeString(rawHTML)
Strip all HTML
s := htmlsanitizer.NewHTMLSanitizer()
s.AllowList = nil

sanitizedHTML, err := s.SanitizeString(rawHTML)

Testing

go test ./...              # run tests
go test -race ./...        # with race detection
go test -bench=. -benchmem ./...  # benchmarks
go test -fuzz=FuzzSanitize -fuzztime=30s .  # fuzz testing

Documentation

Index

Examples

Constants

This section is empty.

Variables

View Source
// DefaultAllowList is the default allowlist for the HTML filter.
//
// It contains most tags listed in
// https://developer.mozilla.org/en-US/docs/Web/HTML/Element . Modifying it
// directly is not recommended; call Clone and modify the copy instead.
var DefaultAllowList = &AllowList{
	Tags: []*Tag{
		// Content sectioning.
		{Name: "address", Attr: []string{}, URLAttr: []string{}},
		{Name: "article", Attr: []string{}, URLAttr: []string{}},
		{Name: "aside", Attr: []string{}, URLAttr: []string{}},
		{Name: "footer", Attr: []string{}, URLAttr: []string{}},
		{Name: "header", Attr: []string{}, URLAttr: []string{}},
		{Name: "h1", Attr: []string{}, URLAttr: []string{}},
		{Name: "h2", Attr: []string{}, URLAttr: []string{}},
		{Name: "h3", Attr: []string{}, URLAttr: []string{}},
		{Name: "h4", Attr: []string{}, URLAttr: []string{}},
		{Name: "h5", Attr: []string{}, URLAttr: []string{}},
		{Name: "h6", Attr: []string{}, URLAttr: []string{}},
		{Name: "hgroup", Attr: []string{}, URLAttr: []string{}},
		{Name: "main", Attr: []string{}, URLAttr: []string{}},
		{Name: "nav", Attr: []string{}, URLAttr: []string{}},
		{Name: "section", Attr: []string{}, URLAttr: []string{}},

		// Text content.
		{Name: "blockquote", Attr: []string{}, URLAttr: []string{"cite"}},
		{Name: "dd", Attr: []string{}, URLAttr: []string{}},
		{Name: "div", Attr: []string{}, URLAttr: []string{}},
		{Name: "dl", Attr: []string{}, URLAttr: []string{}},
		{Name: "dt", Attr: []string{}, URLAttr: []string{}},
		{Name: "figcaption", Attr: []string{}, URLAttr: []string{}},
		{Name: "figure", Attr: []string{}, URLAttr: []string{}},
		{Name: "hr", Attr: []string{}, URLAttr: []string{}},
		{Name: "li", Attr: []string{}, URLAttr: []string{}},
		{Name: "ol", Attr: []string{}, URLAttr: []string{}},
		{Name: "p", Attr: []string{}, URLAttr: []string{}},
		{Name: "pre", Attr: []string{}, URLAttr: []string{}},
		{Name: "ul", Attr: []string{}, URLAttr: []string{}},

		// Inline text semantics.
		{
			Name:    "a",
			Attr:    []string{"rel", "target", "referrerpolicy"},
			URLAttr: []string{"href"},
		},
		{Name: "abbr", Attr: []string{"title"}, URLAttr: []string{}},
		{Name: "b", Attr: []string{}, URLAttr: []string{}},
		{Name: "bdi", Attr: []string{}, URLAttr: []string{}},
		{Name: "bdo", Attr: []string{}, URLAttr: []string{}},
		{Name: "br", Attr: []string{}, URLAttr: []string{}},
		{Name: "cite", Attr: []string{}, URLAttr: []string{}},
		{Name: "code", Attr: []string{}, URLAttr: []string{}},
		{Name: "data", Attr: []string{"value"}, URLAttr: []string{}},
		{Name: "em", Attr: []string{}, URLAttr: []string{}},
		{Name: "i", Attr: []string{}, URLAttr: []string{}},
		{Name: "kbd", Attr: []string{}, URLAttr: []string{}},
		{Name: "mark", Attr: []string{}, URLAttr: []string{}},
		{Name: "q", Attr: []string{}, URLAttr: []string{"cite"}},
		{Name: "s", Attr: []string{}, URLAttr: []string{}},
		{Name: "small", Attr: []string{}, URLAttr: []string{}},
		{Name: "span", Attr: []string{}, URLAttr: []string{}},
		{Name: "strong", Attr: []string{}, URLAttr: []string{}},
		{Name: "sub", Attr: []string{}, URLAttr: []string{}},
		{Name: "sup", Attr: []string{}, URLAttr: []string{}},
		{Name: "time", Attr: []string{"datetime"}, URLAttr: []string{}},
		{Name: "u", Attr: []string{}, URLAttr: []string{}},

		// Image and multimedia.
		{
			Name:    "area",
			Attr:    []string{"alt", "coords", "shape", "target", "rel", "referrerpolicy"},
			URLAttr: []string{"href"},
		},
		{
			Name:    "audio",
			Attr:    []string{"autoplay", "controls", "crossorigin", "duration", "loop", "muted", "preload"},
			URLAttr: []string{"src"},
		},
		{
			Name:    "img",
			Attr:    []string{"alt", "crossorigin", "height", "width", "loading", "referrerpolicy"},
			URLAttr: []string{"src"},
		},
		{Name: "map", Attr: []string{"name"}, URLAttr: []string{}},
		{
			Name:    "track",
			Attr:    []string{"default", "kind", "label", "srclang"},
			URLAttr: []string{"src"},
		},
		{
			Name:    "video",
			Attr:    []string{"autoplay", "buffered", "controls", "crossorigin", "duration", "loop", "muted", "preload", "height", "width"},
			URLAttr: []string{"src", "poster"},
		},

		// Embedded content.
		{Name: "picture", Attr: []string{}, URLAttr: []string{}},
		{Name: "source", Attr: []string{"type"}, URLAttr: []string{"src"}},

		// Demarcating edits.
		{Name: "del", Attr: []string{}, URLAttr: []string{}},
		{Name: "ins", Attr: []string{}, URLAttr: []string{}},

		// Table content.
		{Name: "caption", Attr: []string{}, URLAttr: []string{}},
		{Name: "col", Attr: []string{"span"}, URLAttr: []string{}},
		{Name: "colgroup", Attr: []string{}, URLAttr: []string{}},
		{Name: "table", Attr: []string{}, URLAttr: []string{}},
		{Name: "tbody", Attr: []string{}, URLAttr: []string{}},
		{Name: "td", Attr: []string{"colspan", "rowspan"}, URLAttr: []string{}},
		{Name: "tfoot", Attr: []string{}, URLAttr: []string{}},
		{Name: "th", Attr: []string{"colspan", "rowspan", "scope"}, URLAttr: []string{}},
		{Name: "thead", Attr: []string{}, URLAttr: []string{}},
		{Name: "tr", Attr: []string{}, URLAttr: []string{}},

		// Interactive elements.
		{Name: "details", Attr: []string{"open"}, URLAttr: []string{}},
		{Name: "summary", Attr: []string{}, URLAttr: []string{}},
	},
	// Attributes accepted on every allowed tag.
	GlobalAttr: []string{"class", "id"},
	// Tags whose content is not real HTML (see AllowList.NonHTMLTags).
	NonHTMLTags: []*Tag{
		{Name: "script"},
		{Name: "style"},
		{Name: "object"},
	},
}

DefaultAllowList for HTML filter.

The allowlist contains most tags listed in https://developer.mozilla.org/en-US/docs/Web/HTML/Element . It is not recommended to modify the default list directly; use .Clone() and then modify the new one instead.

Functions

func DefaultURLSanitizer

func DefaultURLSanitizer(rawURL string) (sanitized string, ok bool)

DefaultURLSanitizer is a default and strict sanitizer. It only accepts

  • URL with scheme http or https
  • relative URL, such as abc, abc?xxx=1, abc#123
  • absolute URL, such as /abc, /abc?xxx=1, /abc#123

func NewWriter

func NewWriter(w io.Writer) io.Writer

NewWriter returns a new Writer, with DefaultAllowList, writing sanitized HTML content to w.

Example
package main

import (
	"bytes"
	"fmt"
	"io"
	"strings"

	"github.com/sym01/htmlsanitizer"
)

// main streams a large, repetitive HTML payload through a sanitizing writer
// and prints whether the sanitized output equals the expected text.
func main() {
	// Input: one chunk repeated 1024 times.
	input := strings.Repeat(`abc-->
<a href="javascript:alert(1)">link1</a>
<a href=http://example.com>link2<script>xxx</script></a>
<!--`, 1024)

	// Expected result: the leading "abc-->" escaped once, followed by the
	// two cleaned links for every repetition.
	want := "abc--&gt;" + strings.Repeat(`
<a>link1</a>
<a href="http://example.com">link2</a>
`, 1024)

	// sink collects the sanitized bytes; src supplies the raw input.
	var sink bytes.Buffer
	src := bytes.NewBufferString(input)

	// Errors from the copy are deliberately ignored in this demo.
	w := htmlsanitizer.NewWriter(&sink)
	_, _ = io.Copy(w, src)

	// check the result, for demo only
	fmt.Print(sink.String() == want)
}
Output:
true

func Sanitize

func Sanitize(data []byte) ([]byte, error)

Sanitize uses the DefaultAllowList to sanitize the HTML data.

func SanitizeString

func SanitizeString(data string) (string, error)

SanitizeString uses the DefaultAllowList to sanitize the HTML string.

Types

type AllowList added in v1.0.1

// AllowList specifies all the allowed HTML tags and their attributes for the
// sanitizer.
type AllowList struct {
	// Tags lists every allowed tag.
	Tags []*Tag

	// GlobalAttr lists the attributes allowed on every tag.
	// It is useful for common attributes such as `class` and `id`.
	// For security reasons, it is not recommended to list any
	// URL-related attribute here.
	GlobalAttr []string

	// NonHTMLTags defines a set of special tags, such as <script> and <style>.
	// The content of these kinds of tags is not real HTML content, so each
	// such tag is treated as a single element without any child elements.
	// TODO: rename this one
	NonHTMLTags []*Tag
	// contains filtered or unexported fields
}

AllowList specifies all the allowed HTML tags and their attributes for the sanitizer.

func (*AllowList) Clone added in v1.0.1

func (l *AllowList) Clone() *AllowList

Clone a new AllowList. Tags and NonHTMLTags are deep-copied so that mutating the clone does not affect the original.

func (*AllowList) FindTag added in v1.0.1

func (l *AllowList) FindTag(name string) *Tag

FindTag finds and returns tag by its name. The name parameter must already be lowercased.

func (*AllowList) RemoveTag added in v1.0.1

func (l *AllowList) RemoveTag(name string)

RemoveTag removes all tags named `name`; the name must be lowercase. It is not recommended to modify the default list directly; use .Clone() and then modify the new one instead.

Example
package main

import (
	"fmt"

	"github.com/sym01/htmlsanitizer"
)

// main shows how to drop a single tag from the allowlist before sanitizing.
func main() {
	const rawHTML = `
<h1 ClaSs="h1">hello</h1>
<p>
	Hello, world<br>
	Welcome to use <a href="https://github.com/sym01/htmlsanitizer">htmlsanitizer</a>
</p>`

	// Some applications must not let users submit <a> tags at all.
	s := htmlsanitizer.NewHTMLSanitizer()
	s.RemoveTag("a")

	// The error is ignored for brevity in this example.
	cleaned, _ := s.SanitizeString(rawHTML)
	fmt.Print(cleaned)
}
Output:

<h1 class="h1">hello</h1>
<p>
	Hello, world<br>
	Welcome to use htmlsanitizer
</p>

type HTMLSanitizer

// HTMLSanitizer is an allowlist-based HTML sanitizer for arbitrary HTML
// content.
type HTMLSanitizer struct {
	// AllowList is embedded, so its fields and methods are available
	// directly on the sanitizer.
	*AllowList

	// URLSanitizer is the func used to sanitize every URLAttr value.
	// It returns the sanitized URL and a bool indicating whether the
	// current attribute is acceptable. If it is not acceptable, the
	// current attribute is ignored.
	// If the func is nil, DefaultURLSanitizer is used.
	URLSanitizer func(rawURL string) (sanitized string, ok bool)
}

HTMLSanitizer is a super fast HTML sanitizer for arbitrary HTML content. It is an allowlist-based sanitizer whose time complexity is O(n).

Example (CustomURLSanitizer)
package main

import (
	"fmt"
	"net/url"

	"github.com/sym01/htmlsanitizer"
)

// main installs a custom URL sanitizer that keeps only links whose host is
// example.com, then prints the sanitized markup.
func main() {
	const rawHTML = `
<a href="http://others.com">Link</a>
<a href="https://example.com/xxx">Link with example.com</a>
	`

	s := htmlsanitizer.NewHTMLSanitizer()

	// only links with domain name example.com are allowed.
	s.URLSanitizer = func(rawURL string) (string, bool) {
		// Apply the default checks first; reject whatever they reject.
		cleaned, ok := htmlsanitizer.DefaultURLSanitizer(rawURL)
		if !ok {
			return cleaned, false
		}

		parsed, err := url.Parse(cleaned)
		if err != nil {
			return cleaned, false
		}

		// Accept the URL only when its host matches exactly.
		return cleaned, parsed.Host == "example.com"
	}

	// The error is ignored for brevity in this example.
	cleaned, _ := s.SanitizeString(rawHTML)
	fmt.Print(cleaned)
}
Output:

<a>Link</a>
<a href="https://example.com/xxx">Link with example.com</a>
Example (KeepStyleSheet)
package main

import (
	"fmt"

	"github.com/sym01/htmlsanitizer"
)

// main extends the allowlist with <style> and the document-level tags, then
// sanitizes a complete HTML page and prints the result.
func main() {
	s := htmlsanitizer.NewHTMLSanitizer()

	// Allow <style>, <head>, <body> and <html> in addition to the defaults.
	for _, name := range []string{"style", "head", "body", "html"} {
		s.AllowList.Tags = append(s.AllowList.Tags, &htmlsanitizer.Tag{Name: name})
	}

	const page = `<!doctype html>
<html>
<head>
	<style type="text/css">
	body {
		background-color: #f0f0f2;
		margin: 0;
		padding: 0;
		bad-attr: <body></body>;
		bad-attr: <body></body >;
		bad-attr: <body></ body>;
		font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
	}
	</style>
</head>
<body>
	<div>
	<h1>Example Domain</h1>
	<p><a href="https://www.iana.org/domains/example">More information...</a></p>
	</div>
</body>
</html>`

	// The error is ignored for brevity in this example.
	cleaned, _ := s.SanitizeString(page)
	fmt.Print(cleaned)
}
Output:

<html>
<head>
	<style>
	body {
		background-color: #f0f0f2;
		margin: 0;
		padding: 0;
		bad-attr: &lt;body&gt;&lt;/body&gt;;
		bad-attr: &lt;body&gt;&lt;/body &gt;;
		bad-attr: &lt;body&gt;&lt;/ body&gt;;
		font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
	}
	</style>
</head>
<body>
	<div>
	<h1>Example Domain</h1>
	<p><a href="https://www.iana.org/domains/example">More information...</a></p>
	</div>
</body>
</html>
Example (NoTagsAllowed)
package main

import (
	"fmt"

	"github.com/sym01/htmlsanitizer"
)

// main demonstrates stripping every tag by clearing the allowlist.
func main() {
	const rawHTML = `
<a href="http://others.com">Link</a>
<a href="https://example.com/xxx">Link with example.com</a>
	`

	s := htmlsanitizer.NewHTMLSanitizer()
	// just set AllowList to nil to disable all tags
	s.AllowList = nil

	// of course nothing will happen here
	s.RemoveTag("a")

	// The error is ignored for brevity in this example.
	cleaned, _ := s.SanitizeString(rawHTML)
	fmt.Print(cleaned)
}
Output:

Link
Link with example.com
Example (OnlyAllowHrefTag)
package main

import (
	"fmt"

	"github.com/sym01/htmlsanitizer"
)

// main restricts the allowlist to a single <a> tag whose only permitted
// attribute is the URL attribute href, then prints the sanitized markup.
func main() {
	sanitizer := htmlsanitizer.NewHTMLSanitizer()
	// Use keyed fields: an unkeyed literal of a struct type from another
	// package is reported by `go vet` and breaks if the struct gains fields.
	// Attr is left unset (nil), so no plain attributes are allowed on <a>.
	sanitizer.AllowList.Tags = []*htmlsanitizer.Tag{
		{Name: "a", URLAttr: []string{"href"}},
	}

	data := `
<details/open/ontoggle=alert(1)></details>
<a href="http://others.com" target="_blank">Link</a>
<a href="https://example.com/xxx">Link with example.com</a>
	`
	// The error is ignored for brevity in this example.
	output, _ := sanitizer.SanitizeString(data)
	fmt.Print(output)
}
Output:

<a href="http://others.com">Link</a>
<a href="https://example.com/xxx">Link with example.com</a>

func NewHTMLSanitizer

func NewHTMLSanitizer() *HTMLSanitizer

NewHTMLSanitizer creates a new HTMLSanitizer with the clone of the DefaultAllowList.

func (*HTMLSanitizer) NewWriter

func (f *HTMLSanitizer) NewWriter(w io.Writer) io.Writer

NewWriter returns a new Writer writing sanitized HTML content to w.

func (*HTMLSanitizer) Sanitize

func (f *HTMLSanitizer) Sanitize(data []byte) ([]byte, error)

Sanitize the HTML data and return the sanitized HTML.

func (*HTMLSanitizer) SanitizeString

func (f *HTMLSanitizer) SanitizeString(data string) (string, error)

SanitizeString sanitizes the HTML string and returns the sanitized HTML.

type Tag

// Tag describes a tag together with its allowed attributes.
type Tag struct {
	// Name is the name of the tag; it must be lowercase.
	Name string

	// Attr lists the allowed attributes for the tag; each entry
	// must be lowercase.
	//
	// e.g. colspan, rowspan
	Attr []string

	// URLAttr lists the allowed URL-related attributes for the tag;
	// each entry must be lowercase.
	//
	// e.g. src, href
	URLAttr []string
}

Tag with its attributes.

Directories

Path Synopsis
cmd
htmlsanitizer command

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL