Regular Expression Performance in Go
Optimize Go regexp usage with RE2 engine characteristics, compile patterns efficiently, and know when to avoid regexp
Regular expressions are powerful but can become performance bottlenecks if used incorrectly. This guide covers Go's RE2 engine, optimization techniques, and when to avoid regexp entirely.
RE2 Engine: Guaranteed Linear Time
Go uses the RE2 engine (designed by Google) which guarantees linear-time matching. Unlike PCRE, it prevents catastrophic backtracking at the cost of some features.
RE2 Guarantees
// RE2 properties:
// - O(n) time complexity where n = input length
// - No exponential backtracking possible
// - No backreferences or lookahead/lookbehind
// - Predictable performance
// This is safe even with adversarial input
// SafeRegexpMatch compiles pattern and reports whether it matches text.
// RE2 guarantees linear-time matching, so this is safe even when both the
// pattern and the input come from untrusted sources.
func SafeRegexpMatch(pattern, text string) (bool, error) {
	compiled, compileErr := regexp.Compile(pattern)
	if compileErr != nil {
		return false, compileErr
	}
	// Guaranteed to complete in O(len(text)) — no catastrophic backtracking.
	matched := compiled.MatchString(text)
	return matched, nil
}
// Example: This pattern is safe in Go (would be ReDoS in JavaScript)
func ReDoSSafeInGo() {
// In JavaScript: "aaaaaaaaaaaaaaaaaaaaaaaaa..." hangs
// In Go: Completes instantly (linear time)
re, _ := regexp.Compile(`(a+)+b`)
re.MatchString("aaaaaaaaaaaaaaaaaaaaaaaa") // Fast
}regexp.Compile vs regexp.MustCompile
Use Compile for production code, MustCompile only when pattern is known at compile time.
// WRONG: Compiling in hot path (happens every request)
// Anti-pattern kept for illustration: the full compile cost is paid on
// every single call.
func ExtractIDWrong(text string) string {
	re, _ := regexp.Compile(`\d+`) // Recompiled 1000x times
	return re.FindString(text)
}
// CORRECT: Compile once at package scope, reuse on every call.
var idRegexp = regexp.MustCompile(`\d+`)

// ExtractIDRight returns the first run of digits in text ("" when none).
func ExtractIDRight(text string) string {
	return idRegexp.FindString(text)
}
// PRODUCTION: Handle errors with Compile
var idRegexp *regexp.Regexp

func init() {
	// Compile the static pattern once at startup. A failure here is a
	// programmer error (the literal is fixed), so panicking is acceptable.
	re, err := regexp.Compile(`\d+`)
	if err != nil {
		panic(err) // Pattern is static, should never fail
	}
	idRegexp = re
}
// Benchmark: Cost of recompilation
// Compares recompiling the pattern on every iteration against reusing a
// single compiled *regexp.Regexp; ReportAllocs surfaces the extra
// allocations that repeated compilation causes.
func BenchmarkCompileVsReuse(b *testing.B) {
text := "The answer is 42"
b.Run("compile_each_time", func(b *testing.B) {
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
// Full compile cost paid inside the timed loop (the anti-pattern).
re, _ := regexp.Compile(`\d+`)
re.FindString(text)
}
})
b.Run("reuse_compiled", func(b *testing.B) {
// Compiled once, before the timer starts.
re := regexp.MustCompile(`\d+`)
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
re.FindString(text)
}
})
}
// Result: Reuse is 100x+ faster

Critical Rule: Compile once as a package variable, use forever.
regexp.Compile vs regexp.CompilePOSIX
POSIX semantics have different matching behavior with potential performance impact:
// BenchmarkCompileVsPOSIX contrasts standard matching with POSIX
// leftmost-longest semantics on the same pattern and input.
func BenchmarkCompileVsPOSIX(b *testing.B) {
text := "hello world"
b.Run("standard", func(b *testing.B) {
re := regexp.MustCompile(`h.*o`)
b.ResetTimer()
for i := 0; i < b.N; i++ {
re.MatchString(text)
}
})
b.Run("posix", func(b *testing.B) {
// MustCompilePOSIX restricts the syntax to POSIX egrep and selects
// the leftmost-longest match rather than the leftmost-first one.
re := regexp.MustCompilePOSIX(`h.*o`)
b.ResetTimer()
for i := 0; i < b.N; i++ {
re.MatchString(text)
}
})
}
// Difference: Standard = greedy, POSIX = leftmost-longest
// Practical impact: Usually minimal

Choose standard unless you need specific POSIX behavior.
regexp.Match vs Compiled.Match: Performance Difference
The difference between these approaches is dramatic:
// BenchmarkRegexpMatch shows why the package-level regexp.Match helper is
// slow: it recompiles the pattern on every call, unlike a cached *Regexp.
func BenchmarkRegexpMatch(b *testing.B) {
pattern := `\d+`
text := "The answer is 42"
b.Run("regexp.Match", func(b *testing.B) {
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
// Compiles pattern from scratch on each iteration.
regexp.Match(pattern, []byte(text))
}
})
b.Run("compiled.MatchString", func(b *testing.B) {
re := regexp.MustCompile(pattern)
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
re.MatchString(text)
}
})
}
// Result: regexp.Match is 100-200x slower!
// Because it recompiles the pattern every call

Never use regexp.Match(); always compile first.
Avoiding Regexp: Simple Pattern Alternatives
Many common tasks are faster without regexp:
import (
"strings"
"unicode"
)
// Task: Check for digits
// SLOW
func HasDigitsRegexp(s string) bool {
	re := regexp.MustCompile(`\d`) // compiled on every call — the slow part
	return re.MatchString(s)
}
// FAST
// HasDigits reports whether s contains at least one Unicode digit,
// scanning runes directly with no regexp machinery.
func HasDigits(s string) bool {
	return strings.IndexFunc(s, unicode.IsDigit) >= 0
}
// Task: Check if string contains substring
// SLOW
func ContainsWordRegexp(s, word string) bool {
	// NOTE(review): word is spliced into the pattern unescaped, so regexp
	// metacharacters in word would change the match — illustrative only.
	pattern := `\b` + word + `\b`
	return regexp.MustCompile(pattern).MatchString(s)
}
// FAST
// NOTE: unlike the regexp version this is a plain substring test with no
// word-boundary semantics ("cat" matches inside "concatenate").
func ContainsWord(s, word string) bool {
	return strings.Index(s, word) != -1
}
// Task: Check prefix/suffix
// SLOW
func StartsWithRegexp(s, prefix string) bool {
	// Builds and compiles a fresh anchored pattern per call; prefix is not
	// regexp-escaped, so metacharacters would alter the match.
	re := regexp.MustCompile(`^` + prefix)
	return re.MatchString(s)
}
// FAST
func StartsWith(s, prefix string) bool {
	// Same contract as strings.HasPrefix: a length check plus one compare.
	return len(s) >= len(prefix) && s[:len(prefix)] == prefix
}
// Task: Extract first match
// SLOW
func ExtractFirstRegexp(s string) string {
	re := regexp.MustCompile(`\d+`) // recompiled on every call
	return re.FindString(s)
}
// FAST (for simple pattern)
// ExtractFirst returns the first maximal run of ASCII digits in s, or ""
// when s contains none — the hand-rolled equivalent of FindString(`\d+`).
func ExtractFirst(s string) string {
	i := 0
	// Skip to the first digit; digits are ASCII, so a byte scan suffices.
	for i < len(s) && (s[i] < '0' || s[i] > '9') {
		i++
	}
	if i == len(s) {
		return ""
	}
	// Extend to the end of the digit run.
	j := i + 1
	for j < len(s) && s[j] >= '0' && s[j] <= '9' {
		j++
	}
	return s[i:j]
}
// Task: Split by delimiter
// SLOW
func SplitRegexp(s string) []string {
	re := regexp.MustCompile(`\s+`) // recompiled on every call
	return re.Split(s, -1)
}
// FAST
func Split(s string) []string {
	// NOTE: close but not identical to the regexp version — Fields never
	// yields empty strings (e.g. for leading/trailing whitespace), whereas
	// Split on `\s+` can. Fields' behavior is usually what you want.
	return strings.Fields(s) // Splits on whitespace
}
// Task: Find and replace
// SLOW
func ReplaceRegexp(s, pattern, replacement string) string {
	re := regexp.MustCompile(pattern) // compiled on every call
	return re.ReplaceAllString(s, replacement)
}
// FAST (for literal strings)
// Replace substitutes every occurrence of old with new — no pattern
// interpretation at all.
func Replace(s, old, new string) string {
	result := strings.ReplaceAll(s, old, new)
	return result
}
// Benchmark
// Head-to-head comparison of regexp against the equivalent strings /
// byte-loop implementations for two simple tasks: substring search and
// digit detection.
func BenchmarkRegexpAlternatives(b *testing.B) {
text := "The quick brown fox jumps over the lazy dog"
b.Run("contains_regexp", func(b *testing.B) {
re := regexp.MustCompile("brown")
b.ResetTimer()
for i := 0; i < b.N; i++ {
re.MatchString(text)
}
})
b.Run("contains_strings", func(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
strings.Contains(text, "brown")
}
})
b.Run("hasdigit_regexp", func(b *testing.B) {
re := regexp.MustCompile(`\d`)
b.ResetTimer()
for i := 0; i < b.N; i++ {
re.MatchString(text)
}
})
b.Run("hasdigit_loop", func(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
// Hand-rolled ASCII digit scan; this text has no digits, so the
// inner loop always walks the whole string.
for _, c := range text {
if c >= '0' && c <= '9' {
break
}
}
}
})
}
// Result: Non-regexp is 5-20x faster for simple patterns

Rule of thumb: If you can solve it with the strings package, do it. Regexp is for genuinely complex patterns.
Submatch vs FindString: Memory Allocation
Extracting capture groups allocates more memory:
// BenchmarkSubmatches compares the allocation profiles of the three Find
// variants: whole match, submatch strings, and submatch index pairs.
func BenchmarkSubmatches(b *testing.B) {
re := regexp.MustCompile(`(\d+)-(\w+)`)
text := "12345-hello"
b.Run("FindString", func(b *testing.B) {
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
_ = re.FindString(text)
}
})
b.Run("FindStringSubmatch", func(b *testing.B) {
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
// Allocates a []string holding the full match plus both groups.
_ = re.FindStringSubmatch(text)
}
})
b.Run("FindStringSubmatchIndex", func(b *testing.B) {
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
// Only an []int of positions — no substring copies.
_ = re.FindStringSubmatchIndex(text)
}
})
}
// Results show submatches allocate more
// Use FindStringSubmatchIndex if you only need positions

Thread Safety: Concurrent Use
regexp.Regexp is safe for concurrent use without additional synchronization:
// Safe to use from multiple goroutines
var re = regexp.MustCompile(`\d+`)

// ProcessConcurrently runs FindString on each input in its own goroutine.
// A *regexp.Regexp is internally synchronized, so no extra locking is
// required around it.
func ProcessConcurrently(inputs []string) {
	var wg sync.WaitGroup
	wg.Add(len(inputs))
	for _, input := range inputs {
		go func(text string) {
			defer wg.Done()
			_ = re.FindString(text) // No mutex needed
		}(input)
	}
	wg.Wait()
}
// Benchmark: concurrency overhead
func BenchmarkConcurrentRegexp(b *testing.B) {
re := regexp.MustCompile(`\d+`)
texts := make([]string, 1000)
for i := 0; i < 1000; i++ {
texts[i] = fmt.Sprintf("text_%d", i)
}
b.Run("sequential", func(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
for _, text := range texts {
re.FindString(text)
}
}
})
b.Run("concurrent", func(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
var wg sync.WaitGroup
for _, text := range texts {
wg.Add(1)
go func(t string) {
defer wg.Done()
re.FindString(t)
}(text)
}
wg.Wait()
}
})
}Pattern Features: Performance Impact
Different pattern elements have different costs:
// BenchmarkPatternFeatures times a range of pattern constructs against the
// same input to give a rough relative-cost ordering.
func BenchmarkPatternFeatures(b *testing.B) {
text := "The quick brown fox jumps"
patterns := map[string]string{
"literal": "brown",
"anchor_start": "^The",
"anchor_end": "jumps$",
"char_class": "[aeiou]",
"digit": `\d+`,
"quantifier_star": "a*",
"quantifier_plus": "a+",
"alternation": "fox|dog",
"group": "(brown|fox)",
"non_capturing": "(?:brown|fox)",
}
// Map iteration order is random, so sub-benchmarks run in varying order.
for name, pattern := range patterns {
re := regexp.MustCompile(pattern)
b.Run(name, func(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
re.MatchString(text)
}
})
}
}
// Rough ordering (fastest to slowest):
// 1. Anchored patterns (^$)
// 2. Literal strings
// 3. Character classes
// 4. Quantifiers (+ and *)
// 5. Alternation (|)
// 6. Groups with repetition

Alternatives for Maximum Speed
grafana/regexp: Optimized Fork
import "github.com/grafana/regexp"
// Drop-in replacement for regexp, optimized for common cases
func FastMatch() {
re := regexp.MustCompile(`\d+`)
re.MatchString("test123") // Faster than standard
}Manual State Machines
For critical paths, consider hand-coded parsers:
// Extract log level efficiently without regexp
// LogLevel identifies the severity parsed from a log line.
type LogLevel int

const (
	DEBUG LogLevel = iota
	INFO
	WARN
	ERROR
)

// Compiled once at package scope — per this guide's own "compile once"
// rule. The original recompiled this pattern on every call.
var logLevelRegexp = regexp.MustCompile(`\[(DEBUG|INFO|WARN|ERROR)\]`)

// ParseLogLevelRegexp extracts the bracketed level from s, defaulting to
// INFO when no recognized level is present.
func ParseLogLevelRegexp(s string) LogLevel {
	matches := logLevelRegexp.FindStringSubmatch(s)
	if matches == nil {
		return INFO
	}
	switch matches[1] {
	case "DEBUG":
		return DEBUG
	case "INFO":
		return INFO
	case "WARN":
		return WARN
	case "ERROR":
		return ERROR
	}
	return INFO
}
// ParseLogLevelManual does the same job with two index scans and no
// regexp; unknown or missing levels fall back to INFO.
func ParseLogLevelManual(s string) LogLevel {
	open := strings.Index(s, "[")
	if open == -1 {
		return INFO
	}
	rel := strings.Index(s[open:], "]")
	if rel == -1 {
		return INFO
	}
	switch s[open+1 : open+rel] {
	case "DEBUG":
		return DEBUG
	case "WARN":
		return WARN
	case "ERROR":
		return ERROR
	default:
		// Covers both explicit "[INFO]" and any unrecognized level.
		return INFO
	}
}
func BenchmarkParseLogLevel(b *testing.B) {
texts := make([]string, 100)
for i := 0; i < 100; i++ {
texts[i] = fmt.Sprintf("[INFO] Log message %d", i)
}
b.Run("regexp", func(b *testing.B) {
re := regexp.MustCompile(`\[(DEBUG|INFO|WARN|ERROR)\]`)
b.ResetTimer()
for i := 0; i < b.N; i++ {
ParseLogLevelRegexp(texts[i%100])
}
})
b.Run("manual", func(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
ParseLogLevelManual(texts[i%100])
}
})
}Real-World: Parsing Log Lines Efficiently
// Parse log format: [2024-02-10 15:30:45] ERROR: Message here
// LogEntry holds the three components of a parsed log line.
type LogEntry struct {
	Timestamp string
	Level     string
	Message   string
}

// Approach 1: Regex (flexible but slow)
var logRegexp = regexp.MustCompile(`\[(.+?)\] (\w+): (.+)`)

// ParseLogRegexp returns nil when the line does not match the format.
func ParseLogRegexp(line string) *LogEntry {
	m := logRegexp.FindStringSubmatch(line)
	if len(m) != 4 {
		return nil
	}
	entry := &LogEntry{}
	entry.Timestamp = m[1]
	entry.Level = m[2]
	entry.Message = m[3]
	return entry
}
// Approach 2: strings.Cut (Go 1.18+)
// ParseLogCut splits the line with three cheap string operations; it
// returns nil when any expected delimiter is missing.
func ParseLogCut(line string) *LogEntry {
	if !strings.HasPrefix(line, "[") {
		return nil
	}
	// Drop the opening bracket, then take everything up to the closing one.
	ts, tail, found := strings.Cut(line[1:], "]")
	if !found {
		return nil
	}
	lvl, msg, found := strings.Cut(strings.TrimPrefix(tail, " "), ": ")
	if !found {
		return nil
	}
	return &LogEntry{Timestamp: ts, Level: lvl, Message: msg}
}
// Approach 3: Manual parsing (fastest)
// ParseLogManual indexes directly into the line. It assumes the exact
// layout "[timestamp] LEVEL: message" and returns nil on any structural
// mismatch.
func ParseLogManual(line string) *LogEntry {
	if len(line) < 2 || line[0] != '[' {
		return nil
	}
	end := strings.IndexByte(line, ']')
	if end == -1 || end+2 >= len(line) {
		return nil
	}
	body := line[end+2:] // past "] "
	sep := strings.IndexByte(body, ':')
	if sep == -1 || sep+2 >= len(body) {
		return nil
	}
	return &LogEntry{
		Timestamp: line[1:end],
		Level:     body[:sep],
		Message:   body[sep+2:], // past ": "
	}
}
// Benchmark
func BenchmarkParseLog(b *testing.B) {
lines := make([]string, 1000)
for i := 0; i < 1000; i++ {
lines[i] = fmt.Sprintf("[2024-02-10 15:30:%02d] ERROR: Database connection failed", i%60)
}
b.Run("regexp", func(b *testing.B) {
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
ParseLogRegexp(lines[i%1000])
}
})
b.Run("strings.Cut", func(b *testing.B) {
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
ParseLogCut(lines[i%1000])
}
})
b.Run("manual", func(b *testing.B) {
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
ParseLogManual(lines[i%1000])
}
})
}
// Results: manual > strings.Cut > regexpPattern Optimization Tips
// Pattern design best practices
// 1. Anchor when possible — lets matching fail fast instead of retrying
// at every position in the input
var goodPattern = regexp.MustCompile(`^user_\d+$`) // Fast
var badPattern = regexp.MustCompile(`user_\d+`) // Slow (searches entire string)
// 2. Avoid unnecessary groups when you never read the submatch
var efficientPattern = regexp.MustCompile(`foo(?:bar)`) // Non-capturing
var wastePattern = regexp.MustCompile(`foo(bar)`) // Capturing (extra bookkeeping when submatches are requested)
// 3. Use specific character classes
var efficientClass = regexp.MustCompile(`[0-9]+`) // Specific range
var wasteClass = regexp.MustCompile(`[0-9a-zA-Z_-]+`) // Broader class
// 4. Prefer + over * when at least one character must be present
var efficientQuantifier = regexp.MustCompile(`\w+`) // One or more
var wasteQuantifier = regexp.MustCompile(`\w*`) // Zero or more (also matches the empty string everywhere)
// 5. Order alternatives by likelihood
// NOTE(review): in RE2 the order mainly affects which match is chosen,
// not speed — this tip matters most for backtracking engines; verify
// with a benchmark before relying on it.
var goodAlternation = regexp.MustCompile(`(?:the|a|an)`) // Most common first
var badAlternation = regexp.MustCompile(`(?:xyzzy|the|a)`) // Rare first
func BenchmarkPatternOptimization(b *testing.B) {
text := "user_12345"
b.Run("unanchored", func(b *testing.B) {
re := regexp.MustCompile(`user_\d+`)
b.ResetTimer()
for i := 0; i < b.N; i++ {
re.MatchString(text)
}
})
b.Run("anchored", func(b *testing.B) {
re := regexp.MustCompile(`^user_\d+$`)
b.ResetTimer()
for i := 0; i < b.N; i++ {
re.MatchString(text)
}
})
}Regexp Performance Checklist
- Compile once, reuse forever: store as a package variable with MustCompile
- Never use regexp.Match(): 100x+ slower than a compiled pattern
- Avoid regexp for simple patterns: use the strings package instead
- Use non-capturing groups: (?:...) instead of (...)
- Anchor patterns when possible: ^ and $ prevent unnecessary searching
- Consider alternatives: manual parsing and state machines for hot paths
- Profile before optimizing: regexp may not be your bottleneck
- Prefer FindStringSubmatchIndex: if you only need positions
- Use grafana/regexp: for performance-critical applications
- Remember RE2 is safe: linear time, no ReDoS attacks possible