feat: implement storybook DSL with template composition and validation
Add complete domain-specific language for authoring narrative content for agent simulations. Features: - Complete parser using LALRPOP + logos lexer - Template composition (includes + multiple inheritance) - Strict mode validation for templates - Reserved keyword protection - Semantic validators (trait ranges, schedule overlaps, life arcs, behaviors) - Name resolution and cross-reference tracking - CLI tool (validate, inspect, query commands) - Query API with filtering - 260 comprehensive tests (unit, integration, property-based) Implementation phases: - Phase 1 (Parser): Complete - Phase 2 (Resolution + Validation): Complete - Phase 3 (Public API + CLI): Complete BREAKING CHANGE: Initial implementation
This commit is contained in:
424
src/syntax/lexer.rs
Normal file
424
src/syntax/lexer.rs
Normal file
@@ -0,0 +1,424 @@
|
||||
use std::fmt;
|
||||
|
||||
use logos::Logos;
|
||||
|
||||
/// Token types for the Storybook language
|
||||
#[derive(Logos, Debug, Clone, PartialEq)]
|
||||
#[logos(skip r"[ \t\n\f]+")] // Skip whitespace
|
||||
#[logos(skip r"//[^\n]*")] // Skip line comments
|
||||
#[logos(skip r"/\*([^*]|\*[^/])*\*/")] // Skip block comments
|
||||
pub enum Token {
|
||||
// Keywords
|
||||
#[token("use")]
|
||||
Use,
|
||||
#[token("character")]
|
||||
Character,
|
||||
#[token("template")]
|
||||
Template,
|
||||
#[token("life_arc")]
|
||||
LifeArc,
|
||||
#[token("schedule")]
|
||||
Schedule,
|
||||
#[token("behavior")]
|
||||
Behavior,
|
||||
#[token("institution")]
|
||||
Institution,
|
||||
#[token("relationship")]
|
||||
Relationship,
|
||||
#[token("location")]
|
||||
Location,
|
||||
#[token("species")]
|
||||
Species,
|
||||
#[token("enum")]
|
||||
Enum,
|
||||
#[token("state")]
|
||||
State,
|
||||
#[token("on")]
|
||||
On,
|
||||
#[token("as")]
|
||||
As,
|
||||
#[token("self")]
|
||||
SelfKw,
|
||||
#[token("other")]
|
||||
Other,
|
||||
#[token("remove")]
|
||||
Remove,
|
||||
#[token("append")]
|
||||
Append,
|
||||
#[token("forall")]
|
||||
ForAll,
|
||||
#[token("exists")]
|
||||
Exists,
|
||||
#[token("in")]
|
||||
In,
|
||||
#[token("where")]
|
||||
Where,
|
||||
#[token("and")]
|
||||
And,
|
||||
#[token("or")]
|
||||
Or,
|
||||
#[token("not")]
|
||||
Not,
|
||||
#[token("strict")]
|
||||
Strict,
|
||||
#[token("include")]
|
||||
Include,
|
||||
#[token("from")]
|
||||
From,
|
||||
#[token("is")]
|
||||
Is,
|
||||
#[token("true")]
|
||||
True,
|
||||
#[token("false")]
|
||||
False,
|
||||
|
||||
// Identifiers and literals
|
||||
#[regex(r"[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
|
||||
Ident(String),
|
||||
|
||||
#[regex(r"-?[0-9]+", |lex| lex.slice().parse::<i64>().ok())]
|
||||
IntLit(i64),
|
||||
|
||||
#[regex(r"-?[0-9]+\.[0-9]+", |lex| lex.slice().parse::<f64>().ok())]
|
||||
FloatLit(f64),
|
||||
|
||||
#[regex(r#""([^"\\]|\\.)*""#, |lex| {
|
||||
let s = lex.slice();
|
||||
s[1..s.len()-1].to_string()
|
||||
})]
|
||||
StringLit(String),
|
||||
|
||||
// Time literal: HH:MM or HH:MM:SS
|
||||
#[regex(r"[0-9]{2}:[0-9]{2}(:[0-9]{2})?", |lex| lex.slice().to_string())]
|
||||
TimeLit(String),
|
||||
|
||||
// Duration literal: e.g., 2h30m, 45m, 1h
|
||||
#[regex(r"[0-9]+[hms]([0-9]+[hms])*", |lex| lex.slice().to_string())]
|
||||
DurationLit(String),
|
||||
|
||||
// Punctuation
|
||||
#[token("{")]
|
||||
LBrace,
|
||||
#[token("}")]
|
||||
RBrace,
|
||||
#[token("(")]
|
||||
LParen,
|
||||
#[token(")")]
|
||||
RParen,
|
||||
#[token("[")]
|
||||
LBracket,
|
||||
#[token("]")]
|
||||
RBracket,
|
||||
#[token(":")]
|
||||
Colon,
|
||||
#[token("::")]
|
||||
ColonColon,
|
||||
#[token(";")]
|
||||
Semicolon,
|
||||
#[token(",")]
|
||||
Comma,
|
||||
#[token(".")]
|
||||
Dot,
|
||||
#[token("..")]
|
||||
DotDot,
|
||||
#[token("*")]
|
||||
Star,
|
||||
#[token("?")]
|
||||
Question,
|
||||
#[token("@")]
|
||||
At,
|
||||
|
||||
// Operators
|
||||
#[token(">")]
|
||||
Gt,
|
||||
#[token(">=")]
|
||||
Ge,
|
||||
#[token("<")]
|
||||
Lt,
|
||||
#[token("<=")]
|
||||
Le,
|
||||
#[token("->")]
|
||||
Arrow,
|
||||
|
||||
// Special markers
|
||||
#[token("---")]
|
||||
ProseMarker,
|
||||
|
||||
// Prose block (handled specially)
|
||||
ProseBlock(super::ast::ProseBlock),
|
||||
|
||||
// Error token
|
||||
Error,
|
||||
}
|
||||
|
||||
impl fmt::Display for Token {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
| Token::Ident(s) => write!(f, "identifier '{}'", s),
|
||||
| Token::IntLit(n) => write!(f, "integer {}", n),
|
||||
| Token::FloatLit(n) => write!(f, "float {}", n),
|
||||
| Token::StringLit(s) => write!(f, "string \"{}\"", s),
|
||||
| Token::TimeLit(s) => write!(f, "time {}", s),
|
||||
| Token::DurationLit(s) => write!(f, "duration {}", s),
|
||||
| Token::ProseBlock(pb) => write!(f, "prose block ---{}", pb.tag),
|
||||
| _ => write!(f, "{:?}", self),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Lexer state machine for handling prose blocks
///
/// The scanner operates in two modes: ordinary token scanning and raw
/// prose capture between an opening `---tag` line and a closing `---`.
#[derive(Debug, Clone)]
enum LexerState {
    /// Scanning ordinary tokens through the logos-derived lexer.
    Normal,
    /// Transient state entered right after an opening `---`;
    /// `scan_prose_tag` immediately transitions out of it.
    ProseTag, // After seeing first ---
    /// Capturing raw prose: carries the block's tag and the byte offset in
    /// the source where the content begins (just past the tag line).
    ProseContent(String, usize), // Tag + content start position
}
|
||||
|
||||
/// Wrapper lexer that handles two-mode scanning
///
/// Wraps the logos token scanner and intercepts `---` markers to capture
/// prose blocks verbatim, yielding `(start, Token, end)` triples.
pub struct Lexer<'a> {
    // Full input text; prose scanning slices into it directly.
    source: &'a str,
    // Current byte offset into `source`, kept in sync across mode switches.
    position: usize,
    // Which scanning mode we are in; see `LexerState`.
    state: LexerState,
    // The logos scanner; `None` while inside a prose block.
    // NOTE(review): after a prose block this is re-created over a *suffix*
    // of `source` (see `scan_prose_content`), so its spans are relative to
    // that suffix — verify consumers translate them to absolute offsets.
    normal_lexer: Option<logos::Lexer<'a, Token>>,
}
|
||||
|
||||
impl<'a> Lexer<'a> {
|
||||
pub fn new(source: &'a str) -> Self {
|
||||
Self {
|
||||
source,
|
||||
position: 0,
|
||||
state: LexerState::Normal,
|
||||
normal_lexer: Some(Token::lexer(source)),
|
||||
}
|
||||
}
|
||||
|
||||
fn scan_prose_tag(&mut self) -> Option<(usize, Token, usize)> {
|
||||
let _start = self.position;
|
||||
self.position += 3; // Skip ---
|
||||
|
||||
// Skip whitespace
|
||||
while self.position < self.source.len() &&
|
||||
self.source[self.position..].starts_with(|c: char| c.is_whitespace())
|
||||
{
|
||||
self.position += 1;
|
||||
}
|
||||
|
||||
// Read tag until whitespace or newline
|
||||
let tag_start = self.position;
|
||||
while self.position < self.source.len() {
|
||||
let ch = self.source[self.position..].chars().next().unwrap();
|
||||
if ch.is_whitespace() {
|
||||
break;
|
||||
}
|
||||
self.position += ch.len_utf8();
|
||||
}
|
||||
|
||||
let tag = self.source[tag_start..self.position].to_string();
|
||||
|
||||
// Skip to end of line
|
||||
while self.position < self.source.len() {
|
||||
let ch = self.source[self.position..].chars().next().unwrap();
|
||||
if ch == '\n' {
|
||||
self.position += 1;
|
||||
break;
|
||||
}
|
||||
self.position += ch.len_utf8();
|
||||
}
|
||||
|
||||
self.state = LexerState::ProseContent(tag, self.position);
|
||||
self.next()
|
||||
}
|
||||
|
||||
fn scan_prose_content(
|
||||
&mut self,
|
||||
tag: String,
|
||||
content_start: usize,
|
||||
) -> Option<(usize, Token, usize)> {
|
||||
let remaining = &self.source[content_start..];
|
||||
let mut byte_offset = 0;
|
||||
|
||||
// Scan until we find closing ---
|
||||
while byte_offset < remaining.len() {
|
||||
if remaining[byte_offset..].starts_with("---") {
|
||||
// Check if it's at start of line (or after whitespace)
|
||||
let is_line_start = byte_offset == 0 ||
|
||||
remaining[..byte_offset]
|
||||
.chars()
|
||||
.rev()
|
||||
.take_while(|&c| c != '\n')
|
||||
.all(|c| c.is_whitespace());
|
||||
|
||||
if is_line_start {
|
||||
// Found closing marker
|
||||
let content_end = content_start + byte_offset;
|
||||
let content = self.source[content_start..content_end]
|
||||
.trim_end()
|
||||
.to_string();
|
||||
let start = content_start.saturating_sub(tag.len() + 4); // Include opening ---tag
|
||||
self.position = content_end + 3; // Skip closing ---
|
||||
self.state = LexerState::Normal;
|
||||
self.normal_lexer = Some(Token::lexer(&self.source[self.position..]));
|
||||
|
||||
let prose_block = super::ast::ProseBlock {
|
||||
tag,
|
||||
content,
|
||||
span: super::ast::Span::new(start, self.position),
|
||||
};
|
||||
return Some((start, Token::ProseBlock(prose_block), self.position));
|
||||
}
|
||||
}
|
||||
|
||||
// Advance by one UTF-8 character to avoid char boundary issues
|
||||
if let Some(ch) = remaining[byte_offset..].chars().next() {
|
||||
byte_offset += ch.len_utf8();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// EOF reached without closing marker - treat as error
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for Lexer<'a> {
|
||||
type Item = (usize, Token, usize);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match &self.state {
|
||||
| LexerState::Normal => {
|
||||
let lexer = self.normal_lexer.as_mut()?;
|
||||
|
||||
let token = lexer.next()?;
|
||||
let span = lexer.span();
|
||||
|
||||
match token {
|
||||
| Ok(Token::ProseMarker) => {
|
||||
// Switch to prose mode
|
||||
let marker_pos = span.start;
|
||||
self.position = marker_pos;
|
||||
self.state = LexerState::ProseTag;
|
||||
self.normal_lexer = None;
|
||||
self.scan_prose_tag()
|
||||
},
|
||||
| Ok(tok) => {
|
||||
self.position = span.end;
|
||||
Some((span.start, tok, span.end))
|
||||
},
|
||||
| Err(_) => {
|
||||
self.position = span.end;
|
||||
Some((span.start, Token::Error, span.end))
|
||||
},
|
||||
}
|
||||
},
|
||||
| LexerState::ProseTag => {
|
||||
// Should not happen - scan_prose_tag transitions state
|
||||
None
|
||||
},
|
||||
| LexerState::ProseContent(tag, content_start) => {
|
||||
let tag = tag.clone();
|
||||
let content_start = *content_start;
|
||||
self.scan_prose_content(tag, content_start)
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Lex `input` and collect just the tokens, discarding spans.
    fn lex(input: &str) -> Vec<Token> {
        Lexer::new(input).map(|(_, tok, _)| tok).collect()
    }

    #[test]
    fn test_basic_tokens() {
        let expected = vec![
            Token::Character,
            Token::Ident("Martha".to_string()),
            Token::LBrace,
            Token::Ident("age".to_string()),
            Token::Colon,
            Token::IntLit(34),
            Token::RBrace,
        ];
        assert_eq!(lex("character Martha { age: 34 }"), expected);
    }

    #[test]
    fn test_prose_block() {
        let input = r#"
---backstory
Martha grew up in a small town.
She loved baking from a young age.
---
"#;
        let tokens = lex(input.trim());

        assert_eq!(tokens.len(), 1);
        match &tokens[0] {
            Token::ProseBlock(pb) => {
                assert_eq!(pb.tag, "backstory");
                assert!(pb.content.contains("Martha grew up"));
                assert!(pb.content.contains("young age"));
            }
            _ => panic!("Expected ProseBlock, got {:?}", tokens[0]),
        }
    }

    #[test]
    fn test_prose_with_dashes_in_content() {
        let input = r#"
---description
She was well-known for her kind-hearted nature.
The bakery had a no-nonsense policy.
---
"#;
        let tokens = lex(input.trim());

        assert_eq!(tokens.len(), 1);
        match &tokens[0] {
            Token::ProseBlock(pb) => {
                assert_eq!(pb.tag, "description");
                // Hyphens inside prose must not be mistaken for a closing ---.
                assert!(pb.content.contains("well-known"));
                assert!(pb.content.contains("kind-hearted"));
                assert!(pb.content.contains("no-nonsense"));
            }
            _ => panic!("Expected ProseBlock"),
        }
    }

    #[test]
    fn test_time_duration_literals() {
        let expected = vec![
            Token::TimeLit("08:30".to_string()),
            Token::TimeLit("14:45:00".to_string()),
            Token::DurationLit("2h30m".to_string()),
            Token::DurationLit("45m".to_string()),
        ];
        assert_eq!(lex("08:30 14:45:00 2h30m 45m"), expected);
    }

    #[test]
    fn test_range_syntax() {
        // `..` must win over two `.` tokens between the integer literals.
        let expected = vec![Token::IntLit(20), Token::DotDot, Token::IntLit(40)];
        assert_eq!(lex("20..40"), expected);
    }
}
|
||||
Reference in New Issue
Block a user