name: regex description: Regex functionality in Nim including std/re, std/nre wrappers around PCRE, and the pure Nim nim-regex alternative with linear-time matching guarantees
Regex Skill for Nim
This skill covers regex functionality in Nim, including the standard library modules (std/re, std/nre), the pure Nim alternative (nim-regex), and the context around PCRE vs PCRE2 migration.
Overview of Nim's Regex Ecosystem
Nim provides multiple regex implementations:
- std/re: Legacy wrapper around PCRE (Perl-Compatible Regular Expressions)
- std/nre: Modern wrapper around PCRE with better API design
- nim-regex: Pure Nim implementation (drop-in replacement, linear-time matching)
The PCRE vs PCRE2 Context
Nim's standard library currently depends on PCRE (not PCRE2), which:
- Last release was in 2021 (no longer actively maintained)
- Is being deprecated in Debian stable (see Debian bug #1071970)
- Will be phased out in future distributions
The issue #23668 tracks the migration from PCRE to PCRE2. However, this is a significant undertaking since PCRE and PCRE2 are API-incompatible.
For users wanting to avoid PCRE dependency, nim-regex provides a pure Nim alternative.
std/re - Legacy PCRE Wrapper
The original regex module wrapping the PCRE C library.
Creating Regex Patterns
import std/re
# Basic pattern
let emailPattern = re"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
# With flags (flags require call syntax; write r"..." for patterns containing backslashes)
let caseInsensitive = re("pattern", {reIgnoreCase})
let multiLine = re("^pattern", {reMultiLine})
let dotAll = re("pattern.*", {reDotAll})
let extended = re("pattern", {reExtended}) # Ignores whitespace and comments
let studied = re("pattern", {reStudy}) # Pre-analyzes for performance
# Extended regex with comments
let verbose = rex"""(?x)
\d+ # Match digits
.* # Followed by anything
"""
Matching Operations
import std/re
# Simple match check
if match("hello world", re"hello"):
echo "Found hello"
# Match with captures
var matches: array[3, string]
if match("abc123def", re"([a-z]+)(\d+)([a-z]+)", matches):
echo matches[0] # "abc"
echo matches[1] # "123"
echo matches[2] # "def"
# Find first occurrence
let pos = find("abcdefg", re"cde") # Returns 2
let posNotFound = find("abcdefg", re"xyz") # Returns -1
# Find with captures
var captures: array[2, string]
let foundPos = find("key=value; key2=value2", re"(\w+)=(\w+)", captures)
# captures[0] = "key", captures[1] = "value"
# Match length
let len = matchLen("abcdefg", re"cde", 2) # Returns 3
let lenNotFound = matchLen("abcdefg", re"xyz") # Returns -1
# Check if string contains pattern
if contains("abcdef", re"cde"):
echo "Pattern found"
# Starts with / Ends with
if startsWith("hello world", re"hello"):
echo "Starts with hello"
if endsWith("hello world", re"world"):
echo "Ends with world"
Find All Iterations
import std/re
# Iterator version
for word in findAll("the quick brown fox", re"\w+"):
echo word # "the", "quick", "brown", "fox"
# Seq version
let allMatches = findAll("abcabcabc", re"abc") # @["abc", "abc", "abc"]
Replace Operations
import std/re
# Simple replace
let replaced = "foo bar foo".replace(re"foo", "baz")
# "baz bar baz"
# Replace with captures
let replacedWithCaptures = "var1=key; var2=key2".replacef(re"(\w+)=(\w+)", "$1<-$2$2")
# "var1<-keykey; var2<-key2key2"
# Multi-replace in parallel
let multiResult = "abc123xyz".multiReplace([
(re"\d+", "NUM"),
(re"[a-z]+", "LET")
])
String Splitting
import std/re, std/sequtils
let parts = toSeq(split("a1b2c3", re"\d+"))
# @["a", "b", "c", ""] (a separator at the very end yields a trailing empty string)
let withCaptures = toSeq(split("a1b2", re"(\d)"))
# @["a", "b", ""] (std/re's split does not add captured separator text to the output)
=~ Template (Implicit Matches)
import std/re
if "NAME = VALUE" =~ re"\s*(\w+)\s*=\s*(\w+)":
echo matches[0], " = ", matches[1] # NAME = VALUE
elif " # comment" =~ re"\s*(\#.*)":
echo matches[0] # "# comment"
std/nre - Modern PCRE Wrapper
Improved API over PCRE with Option-based returns and better ergonomics.
Creating Regex Patterns
import std/nre
# Basic pattern
let pattern = re"(\w+)@(\w+)\.(\w+)"
# With inline flags
let unicodePattern = re"(?i)hello" # Case insensitive
let multilinePattern = re"(?m)^start" # ^ matches line beginnings
let dotallPattern = re"(?s)pattern.*" # . matches newlines
let extendedPattern = re"(?x)pattern # comment" # Whitespace ignored
let ungreedyPattern = re"(?U)pattern.*" # Lazy quantification
# Pattern options (at start)
let utf8Pattern = re"(*UTF8)pattern" # Treat as UTF-8
let ucpPattern = re"(*UCP)\w+" # Unicode character properties
let crlfNewlines = re"(*CRLF)pattern" # CRLF line endings
let noAutoCapture = re"(*NO_AUTO_CAPTURE)(?<name>\w+)" # Manual captures only
Matching Operations
import std/nre, std/options
# Match anchored at the start of the string (the pattern need not consume the whole input)
let result = match("foobar", re"foobar")
# Result: Option[RegexMatch] = some(RegexMatch(...))
if result.isSome:
  let m = result.get()
  echo m.match # "foobar"
  # This pattern has no capture groups, so indexing m.captures here would raise.
# Match with captures
let captureResult = match("john@example.com", re"(\w+)@(\w+)\.(\w+)")
if captureResult.isSome:
let m = captureResult.get()
echo m.captures[0] # "john"
echo m.captures[1] # "example"
echo m.captures[2] # "com"
# Named captures
let namedResult = match("john@example.com", re"(?<user>\w+)@(?<domain>\w+)")
if namedResult.isSome:
let m = namedResult.get()
echo m.captures["user"] # "john"
echo m.captures["domain"] # "example"
# Find substring
let findResult = find("email: john@example.com", re"(\w+)@(\w+)")
if findResult.isSome:
  let m = findResult.get()
  echo m.match # "john@example" (\w+ stops at the '.', so ".com" is not part of the match)
  echo m.captures[0] # "john"
Iteration with findIter and findAll
import std/nre
# Find all matches
for m in findIter("one two three four", re"\w+"):
echo m.match # "one", "two", "three", "four"
# With captures
for m in findIter("a1b2c3", re"(\w)(\d)"):
echo m.captures[0], "-", m.captures[1] # "a-1", "b-2", "c-3"
# Get all as seq
let all = findAll("abcabc", re"abc")
# @["abc", "abc"]
Accessing Captures and Bounds
import std/nre
let m = find("test@example.com", re"(\w+)@(\w+)").get()
# Captures by index
echo m.captures[0] # "test"
echo m.captures[1] # "example"
echo m.captures[-1] # Full match: "test@example" (\w+ stops at the '.')
# Capture bounds (inclusive range)
echo m.captureBounds[0] # 0 .. 3
echo m.captureBounds[1] # 5 .. 11
# Check if capture group was matched
if 0 in m.captureBounds:
echo "Group 0 matched"
# Convert to table
let namedResult = match("key=value", re"(?<k>\w+)=(?<v>\w+)").get()
let table = namedResult.captures.toTable()
# {"k": "key", "v": "value"}
# Convert to seq
let seqResult = namedResult.captures.toSeq()
# @["key", "value"]
Splitting Strings
import std/nre
# Basic split
let parts = split("a1b2c3", re"\d+")
# @["a", "b", "c", ""] (a separator at the very end yields a trailing empty string)
# With captures included
let partsWithCaptures = split("a1b2", re"(\d)")
# @["a", "1", "b", "2", ""]
# Max splits (maxSplit is the maximum number of split pieces returned)
let limited = split("a1b2c3d4", re"\d", maxSplit = 2)
# @["a", "b2c3d4"]
Replace Operations
import std/nre, std/strutils # strutils provides toUpperAscii
# With proc replacement
let upperResult = replace("hello world", re"\w+",
  proc (m: RegexMatch): string = m.match.toUpperAscii())
# "HELLO WORLD"
# With string replacement (captures with $N notation)
let formatted = replace("john@email.com", re"(\w+)@(\w+)", "$1 <at> $2")
# "john <at> email.com" (only the matched "john@email" is replaced)
# Named captures
let namedReplaced = replace("key=value", re"(?<k>\w+)=(?<v>\w+)", "$k = $v")
# "key = value"
# Dollar sign escape: each literal '$' in the output is written as "$$"
let dollarResult = replace("price: $100", re"\$\d+", "$$$$")
# "price: $$"
Contains Check
import std/nre
if contains("abcdef", re"cde"):
echo "Contains pattern"
# With bounds
if contains("abcdef", re"cde", start = 1):
echo "Contains from position 1"
nim-regex - Pure Nim Drop-in Replacement
Pure Nim regex implementation with linear-time matching guarantees. Designed as a drop-in replacement for std/re and std/nre.
Why nim-regex?
- No C dependencies: Pure Nim, compiles to JavaScript/WebAssembly easily
- Linear-time matching: O(n) complexity, safe for untrusted input
- No backreferences: Simpler, faster, safer
- Drop-in replacement: Works where PCRE is unavailable
Creating Patterns with nim-regex
import regex
# Basic pattern (compile-time)
let pattern = re2"(\w+)@(\w+)\.(\w+)"
# Runtime compilation
let runtimePattern = re2(someString)
# With flags
let flags = {regexDotAll, regexCaseless}
let flaggedPattern = re2("pattern", flags)
# Arbitrary bytes mode (treat as bytes, not UTF-8)
let bytesPattern = re2(r"\xff\xfe", {regexArbitraryBytes})
# Raw string literal
let rawPattern = rex"""(?x)
\d+ # digits
.* # anything
"""
nim-regex API
import regex
# Match (whole string must match)
var m = RegexMatch2()
if match("abc", re2"abc", m):
  echo m.boundaries # 0 .. 2 (bounds of the full match; group(i) is for capture groups only)
# Find (substring match)
var findM = RegexMatch2()
if "abcd".find(re2"bc", findM):
  echo findM.boundaries # 1 .. 2
# Find all (each item is a RegexMatch2; slice the input to get the matched text)
let text = "abcabc"
for found in findAll(text, re2"abc"):
  echo text[found.boundaries] # "abc", "abc"
# Find all bounds
for bounds in findAllBounds("abcabc", re2"bc"):
  echo bounds # 1 .. 2, 4 .. 5
# Contains
if re2"bc" in "abcd":
  echo "Contains"
# Split
let parts = split("a1b2c3", re2"\d+")
# @["a", "b", "c", ""] (a separator at the very end yields a trailing empty string)
# Split including captures
let withCaps = splitIncl("a,b", re2"(,)")
# @["a", ",", "b"]
# Replace
let replaced = "aaa".replace(re2"a", "b", 1) # Limit to 1 replacement
# "baa"
# With capture references
let withCaptures = "abc".replace(re2"(a)(b)c", "m($1) m($2)")
# "m(a) m(b)"
# Replace with proc
proc removeStars(m: RegexMatch2, s: string): string =
  ## Replacement callback: yields the text of the first capture group,
  ## or the empty string when that text is a single "*".
  result = s[m.group(0)]
  if result == "*":
    result = ""
let cleaned = "**test**".replace(re2"(\*)", removeStars)
# "test"
# Starts with / Ends with
if "abc".startsWith(re2"\w"):
echo "Starts with word"
if "abc".endsWith(re2"\w"):
echo "Ends with word"
Accessing Results
import regex
let text = "hello world"
var m = RegexMatch2()
discard text.find(re2"(?P<word1>\w+) (?P<word2>\w+)", m)
# Full match bounds
echo m.boundaries # 0 .. 10
# Groups by index (0-based: group(0) is the first capture group, not the full match)
echo m.group(0) # 0 .. 4 (first capture)
echo m.group(1) # 6 .. 10 (second capture)
# Groups by name
echo m.group("word1") # 0 .. 4
# Groups count
echo m.groupsCount # 2
# Group names
echo m.groupNames # @["word1", "word2"]
# Captured text: slice the same string that was searched
let capturedText = text[m.group(1)] # "world"
Match Macro
import regex
# Compile-time regex with macro
match "abc", rex"(\w)+":
echo matches # @["c"] - last capture in repeated group
match "[link](https://example.com)", rex"\[([^\]]+)\]\((https?://[^)]+)\)":
echo matches[0] # "link"
echo matches[1] # "https://example.com"
Compile-time vs Runtime Compilation
import regex
# Compile-time (static string)
const staticPattern = re2"\d+"
# Runtime (dynamic string)
let runtimePattern = re2(someString)
# Function with static parameter
func myMatch(s: string, exp: static string): bool =
  ## Reports whether `s` matches `exp`. Because `exp` is `static`, the
  ## regex is built into a `const`, i.e. compiled at compile time.
  const compiled = re2(exp)
  s.match(compiled)
# The bool result must be used (or discarded), otherwise the call does not compile.
echo myMatch("123", r"\d+") # true; compiles regex at compile time
Escape and Special Characters
import regex
# Escape regex special chars
let escaped = escapeRe("file.txt")
# Matches literal "file.txt", not regex pattern
# Special chars that need escaping:
# ' ', '#', '$', '&', '(', ')', '*', '+', '-', '.',
# '?', '[', '\\', ']', '^', '{', '|', '}', '~'
Unicode Considerations
import regex
# By default, Unicode friendly
assert match("弢弢弢", re2"\w+") # Works with CJK
# ASCII mode only
assert not match("弢弢弢", re2(r"\w+", {regexAscii}))
# Invalid UTF-8 handling (debug mode validates)
when not defined(release):
import unicode
assert validateUtf8("valid string") == -1
assert validateUtf8("\xf8\xa1\xa1\xa1\xa1") != -1
# Arbitrary bytes mode
let bytesFlags = {regexArbitraryBytes}
assert match("\xff\xfe", re2(r"\xff\xfe", bytesFlags))
Quick Comparison Reference
| Feature | std/re | std/nre | nim-regex |
|---|---|---|---|
| C Dependency | PCRE | PCRE | None |
| API Style | Return codes | Option[T] | bool + var RegexMatch2 |
| Complexity | Varies | Varies | O(n) linear |
| Backreferences | Yes | Yes | No |
| Compile-time | No | No | Yes |
| JS Compatible | No | No | Yes |
| Drop-in Replace | Partial | Partial | Yes |
Choosing the Right Module
- std/re: Legacy code, simple use cases, when PCRE is already available
- std/nre: Modern PCRE wrapper, better API, when you need PCRE features
- nim-regex: No C deps needed, linear-time guarantees, WebAssembly targets
Common Patterns
# Email validation
const emailRe = re2"""(?x)
[a-zA-Z0-9._%+-]+
@
[a-zA-Z0-9.-]+
\.
[a-zA-Z]{2,}
"""
# IPv4 address
const ipv4Re = re2"""(?x)
\b
((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}
(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)
\b
"""
# URL pattern
const urlRe = re2"""(?x)
https?://
[^\s]+
"""
Error Handling
# nim-regex: an invalid pattern raises RegexError (requires `import regex`)
# A runtime string is used so the pattern is compiled at run time, where the
# exception can be caught. NOTE(review): a literal such as re2"[unclosed" may
# instead be rejected at compile time - confirm against the installed version.
let userPattern = "[unclosed"
try:
  discard re2(userPattern)
except RegexError as e:
  echo "Regex error: ", e.msg
# std/nre: specific error types, both derived from nre's RegexError
# (requires `import std/nre`)
try:
  discard re(userPattern)
except SyntaxError as e:
  echo "Invalid regex syntax at pos ", e.pos
except StudyError:
  echo "Regex study failed"
Performance Tips
- Use the `reStudy` flag in std/re for repeated matches
- Compile patterns once, store in variables/constants
- nim-regex: Use compile-time strings (`const` or `static`)
- Prefer `contains` over `find` for boolean checks
- Use `findAllBounds` when you only need positions, not captures
PCRE2 - Direct Bindings for Maximum Performance
Tattletale provides direct PCRE2 bindings at workspace/pcre2/pcre2.nim for maximum performance and compatibility with tokenizers like TokenDagger and jtokkit.
Why Direct PCRE2?
- No thread-local storage: Pattern and match data stored in regular objects
- Full control: Access to all PCRE2 features including JIT compilation
- Matches tokenizers: Behavior compatible with C++ TokenDagger implementation
- 8-bit support: Uses PCRE2's native 8-bit API
Basic Usage
import workspace/pcre2/pcre2
# PCRE2 constants
const PCRE2_ZERO_TERMINATED = PCRE2_SIZE(-1)
const PCRE2_ERROR_NOMATCH = cint(-1)
const PCRE2_NOTEMPTY* = MatchOption(0x00000004'u32)
const PCRE2_UTF* = CompileOption(0x00080000'u32)
const PCRE2_UCP* = CompileOption(0x00020000'u32)
const PCRE2_NO_UTF_CHECK* = MatchOption(0x40000000'u32)
# Compile a pattern
var errorCode: CompileError
var errorOffset: csize_t
let pattern = r"\p{L}+" # Unicode letters
let code = compile(pattern, {PCRE2_UTF, PCRE2_UCP}, errorCode, errorOffset)
if code == nil:
  echo "Compile error: ", errorCode, " at offset ", errorOffset
  quit(QuitFailure) # nothing below is valid without a compiled pattern
# Create match data from pattern (ensures correct size)
let matchData = match_data_create_from_pattern(code, nil)
# Get offset vector pointer
let ovector = get_ovector_pointer(matchData)
# Perform match
let rc = match(code, "hello world", 0, {PCRE2_NOTEMPTY}, matchData, nil)
if rc == PCRE2_ERROR_NOMATCH:
  echo "No match"
elif rc >= 0:
  # ovector[0] / ovector[1] hold the start and end offsets of the whole match
  let matchStart = ovector[0].int
  let matchEnd = ovector[1].int
  echo "Match at ", matchStart, " to ", matchEnd
else:
  # Any other negative return code is a matching error, not "no match"
  echo "Match error: ", rc
# Cleanup
match_data_free(matchData)
code_free(code)
High-Level Pattern: Pcre2Code and Pcre2Matcher
import workspace/pcre2/pcre2
type
  Pcre2Code* = object
    ## A compiled PCRE2 pattern together with its source text.
    code*: ptr Code ## handle returned by `compile`; `free` resets it to nil
    pattern*: string ## the pattern source given to `compilePcre2`
  Pcre2Matcher* = object
    ## Matching state for one compiled pattern, filled in by `createMatcher`.
    code*: ptr Code ## copied from the `Pcre2Code`; the matcher's `free` does not release it
    matchData*: ptr MatchData ## result of `match_data_create_from_pattern`
    ovector*: ptr csize_t ## result of `get_ovector_pointer(matchData)`
    ovectorCount*: uint32 ## result of `get_ovector_count(matchData)`
proc compilePcre2*(pattern: string, utf8: bool = true): Pcre2Code =
  ## Compiles `pattern` and returns it wrapped in a `Pcre2Code`.
  ##
  ## With `utf8 = true` (the default) the pattern is compiled with
  ## `PCRE2_UTF` and `PCRE2_UCP`; with `utf8 = false` no compile options
  ## are passed, so pattern and subject are treated as plain bytes.
  ##
  ## Raises `ValueError` (with the PCRE2 error code and offset) when
  ## compilation fails. Release the result with `free`.
  var errorCode: CompileError
  var errorOffset: csize_t
  let options: set[CompileOption] =
    if utf8: {PCRE2_UTF, PCRE2_UCP}
    else: {}
  let code = compile(pattern, options, errorCode, errorOffset)
  if code == nil:
    raise newException(ValueError,
      "PCRE2 compile error " & $errorCode & " at offset " & $errorOffset &
      " in pattern: " & pattern)
  Pcre2Code(code: code, pattern: pattern)
proc createMatcher*(code: Pcre2Code): Pcre2Matcher =
  ## Builds the matching state (match data block and offset vector) for a
  ## compiled pattern.
  ##
  ## Raises `ValueError` when `code` holds no compiled pattern (for example
  ## after `free`) or when the match data block cannot be allocated.
  ## Release the result with `free`.
  if code.code == nil:
    raise newException(ValueError,
      "cannot create a matcher: the PCRE2 pattern is nil (freed or never compiled)")
  let matchData = match_data_create_from_pattern(code.code, nil)
  if matchData == nil:
    raise newException(ValueError, "PCRE2 match data allocation failed")
  Pcre2Matcher(
    code: code.code,
    matchData: matchData,
    ovector: get_ovector_pointer(matchData),
    ovectorCount: get_ovector_count(matchData))
proc findAllPcre2*(matcher: Pcre2Matcher, text: string, startOffset: int = 0): seq[(int, int)] =
  ## Collects every non-empty match of the matcher's pattern in `text`,
  ## scanning left to right from byte offset `startOffset`.
  ##
  ## Each result is a `(start, stop)` pair taken from the first two offset
  ## vector entries; slice the matched text with `text[start ..< stop]`.
  ##
  ## The matcher is taken by value (it is only read here), so it can also be
  ## called on a field of an immutable object.
  ##
  ## Raises `ValueError` when `startOffset` is negative or when PCRE2 reports
  ## a matching error. The first call lets PCRE2 validate the subject, so
  ## invalid UTF-8 surfaces as such an error instead of undefined behaviour.
  if startOffset < 0:
    raise newException(ValueError, "startOffset must be non-negative")
  let subjectLen = text.len.csize_t
  var offset = startOffset.csize_t
  var subjectChecked = false
  while offset < subjectLen:
    # Skip the UTF validity check only after one call has already done it.
    let rc =
      if subjectChecked:
        match(matcher.code, text, offset, {PCRE2_NOTEMPTY, PCRE2_NO_UTF_CHECK},
              matcher.matchData, nil)
      else:
        match(matcher.code, text, offset, {PCRE2_NOTEMPTY}, matcher.matchData, nil)
    subjectChecked = true
    if rc == PCRE2_ERROR_NOMATCH:
      break
    if rc < 0:
      raise newException(ValueError, "PCRE2 match error " & $rc)
    let matchStart = matcher.ovector[0].int
    let matchEnd = matcher.ovector[1].int
    result.add((matchStart, matchEnd))
    # Resume after the match; always advance at least one byte so the loop
    # cannot stall.
    if matchEnd.csize_t > offset:
      offset = matchEnd.csize_t
    else:
      offset += 1
proc free*(matcher: var Pcre2Matcher) =
  ## Releases the matcher's match data block. Safe to call more than once.
  ##
  ## The offset vector lives inside the match data block, so `ovector` and
  ## `ovectorCount` are cleared as well rather than left dangling. The
  ## compiled pattern in `matcher.code` is not released here.
  if matcher.matchData != nil:
    match_data_free(matcher.matchData)
    matcher.matchData = nil
  matcher.ovector = nil
  matcher.ovectorCount = 0
proc free*(code: var Pcre2Code) =
  ## Releases the compiled pattern and resets the handle to nil, so a
  ## second call does nothing.
  if code.code == nil:
    return
  code_free(code.code)
  code.code = nil
# Usage
var pat = compilePcre2(r"\p{L}+") # `var`: free() takes its argument as `var`
var matcher = createMatcher(pat)
let matches = findAllPcre2(matcher, "Hello 世界 🌍")
# matches: @[(0, 5), (6, 12)]
# Offsets are byte offsets: "世界" occupies 6 bytes in UTF-8, and the emoji is
# not a letter (\p{L}), so it is not matched.
matcher.free()
pat.free()
Tokenizer Integration Pattern
import std/tables
import workspace/pcre2/pcre2
type
  BPETokenizer* = object
    ## Tokenizer state: a byte-sequence table plus the compiled split pattern
    ## and its matcher.
    encoder*: Table[seq[byte], int] ## byte sequence -> int; presumably token ids/ranks (not used by splitText) - TODO confirm
    pattern*: Pcre2Code ## compiled pattern; NOTE(review): presumably the one `matcher` was created from - confirm
    matcher*: Pcre2Matcher ## matcher that `splitText` passes to `findAllPcre2`
proc splitText*(tokenizer: BPETokenizer, text: string): seq[string] =
  ## Splits `text` into consecutive pieces: every span matched by the
  ## tokenizer's pattern, plus the unmatched text between and around those
  ## spans, in input order. Concatenating the pieces reproduces `text`.
  # A mutable local copy of the handle struct (pointers only) lets this work
  # with an immutable `tokenizer` even when findAllPcre2 takes its matcher
  # as `var`.
  var matcher = tokenizer.matcher
  var lastPos = 0
  # `end` is a reserved word in Nim, so the bounds are named start/stop.
  for (matchStart, matchStop) in findAllPcre2(matcher, text):
    if matchStart > lastPos:
      result.add(text[lastPos ..< matchStart]) # unmatched gap before this match
    result.add(text[matchStart ..< matchStop])
    lastPos = matchStop
  if lastPos < text.len:
    result.add(text[lastPos ..< text.len]) # unmatched tail
Reference Documentation
- man pcre2api.3: Full PCRE2 API documentation
- man pcre2demo.3: Demonstration program showing correct usage
- workspace/pcre2/pcre2.nim: Tattletale's PCRE2 bindings
Key Takeaways
- Use `match_data_create_from_pattern()` to ensure correct match data size
- Use `get_ovector_pointer()` to access match offsets
- Call `match()` in a loop with incrementing offsets for find-all
- Always free resources with `match_data_free()` and `code_free()`
- Store pattern and matcher in regular objects (not thread-local) for simplicity