Updated dependencies to latest versions, causing breaking changes:
- logos: 0.14 -> 0.16
- lalrpop: 0.21 -> 0.23
- thiserror: 1.0 -> 2.0
- petgraph: 0.6 -> 0.8
- notify: 6.0 -> 8
- toml: 0.8 -> 1.0.2
- tree-sitter (grammar): 0.20 -> 0.26

Fixed compilation issues:
1. logos 0.16: added `allow_greedy` for unbounded repetitions in the lexer
2. lalrpop 0.23: changed from `process_current_dir()` to `process()`
3. tree-sitter 0.26: updated bindings to use a `&Language` reference

Also fixed the Zed extension:
- Removed the local `highlights.scm` override that had diverged from source
- Added a regression test to prevent future divergence
539 lines · 15 KiB · Rust
use std::fmt;
|
|
|
|
use logos::Logos;
|
|
|
|
/// Token types for the Storybook language
|
|
#[derive(Logos, Debug, Clone, PartialEq)]
|
|
#[logos(skip r"[ \t\n\f]+")] // Skip whitespace
|
|
#[allow(clippy::duplicated_attributes)]
|
|
#[logos(skip(r"//[^\n]*", allow_greedy = true))] // Skip line comments
|
|
#[logos(skip(r"/\*([^*]|\*[^/])*\*/", allow_greedy = true))] // Skip block comments
|
|
pub enum Token {
|
|
// Keywords
|
|
#[token("use")]
|
|
Use,
|
|
#[token("character")]
|
|
Character,
|
|
#[token("template")]
|
|
Template,
|
|
#[token("life_arc")]
|
|
LifeArc,
|
|
#[token("schedule")]
|
|
Schedule,
|
|
#[token("behavior")]
|
|
Behavior,
|
|
#[token("institution")]
|
|
Institution,
|
|
#[token("relationship")]
|
|
Relationship,
|
|
#[token("location")]
|
|
Location,
|
|
#[token("species")]
|
|
Species,
|
|
#[token("concept")]
|
|
Concept,
|
|
#[token("sub_concept")]
|
|
SubConcept,
|
|
#[token("concept_comparison")]
|
|
ConceptComparison,
|
|
#[token("any")]
|
|
Any,
|
|
#[token("requires")]
|
|
Requires,
|
|
#[token("state")]
|
|
State,
|
|
#[token("on")]
|
|
On,
|
|
#[token("enter")]
|
|
Enter,
|
|
#[token("as")]
|
|
As,
|
|
#[token("self")]
|
|
SelfKw,
|
|
#[token("other")]
|
|
Other,
|
|
#[token("remove")]
|
|
Remove,
|
|
#[token("append")]
|
|
Append,
|
|
#[token("forall")]
|
|
ForAll,
|
|
#[token("exists")]
|
|
Exists,
|
|
#[token("in")]
|
|
In,
|
|
#[token("where")]
|
|
Where,
|
|
#[token("and")]
|
|
And,
|
|
#[token("or")]
|
|
Or,
|
|
#[token("not")]
|
|
Not,
|
|
#[token("strict")]
|
|
Strict,
|
|
#[token("include")]
|
|
Include,
|
|
#[token("from")]
|
|
From,
|
|
#[token("is")]
|
|
Is,
|
|
#[token("uses")]
|
|
Uses,
|
|
#[token("behaviors")]
|
|
Behaviors,
|
|
#[token("schedules")]
|
|
Schedules,
|
|
#[token("tree")]
|
|
Tree,
|
|
#[token("priority")]
|
|
Priority,
|
|
#[token("modifies")]
|
|
Modifies,
|
|
#[token("override")]
|
|
Override,
|
|
#[token("recurrence")]
|
|
Recurrence,
|
|
#[token("season")]
|
|
Season,
|
|
#[token("block")]
|
|
Block,
|
|
#[token("true")]
|
|
True,
|
|
#[token("false")]
|
|
False,
|
|
|
|
// Behavior tree keywords
|
|
#[token("choose")]
|
|
Choose,
|
|
#[token("then")]
|
|
Then,
|
|
#[token("if")]
|
|
If,
|
|
#[token("when")]
|
|
When,
|
|
#[token("repeat")]
|
|
Repeat,
|
|
#[token("invert")]
|
|
Invert,
|
|
#[token("retry")]
|
|
Retry,
|
|
#[token("timeout")]
|
|
Timeout,
|
|
#[token("cooldown")]
|
|
Cooldown,
|
|
// "guard" keyword removed - use "if" instead (Token::If)
|
|
#[token("succeed_always")]
|
|
SucceedAlways,
|
|
#[token("fail_always")]
|
|
FailAlways,
|
|
|
|
// Identifiers and literals
|
|
#[regex(r"[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
|
|
Ident(String),
|
|
|
|
#[regex(r"-?[0-9]+", |lex| lex.slice().parse::<i64>().ok())]
|
|
NumberLit(i64),
|
|
|
|
#[regex(r"-?[0-9]+\.[0-9]+", |lex| lex.slice().parse::<f64>().ok())]
|
|
DecimalLit(f64),
|
|
|
|
#[regex(r#""([^"\\]|\\.)*""#, |lex| {
|
|
let s = lex.slice();
|
|
s[1..s.len()-1].to_string()
|
|
})]
|
|
TextLit(String),
|
|
|
|
// Time literal: HH:MM or HH:MM:SS
|
|
#[regex(r"[0-9]{2}:[0-9]{2}(:[0-9]{2})?", |lex| lex.slice().to_string())]
|
|
TimeLit(String),
|
|
|
|
// Duration literal: e.g., 2h30m, 45m, 1h
|
|
#[regex(r"[0-9]+[hms]([0-9]+[hms])*", |lex| lex.slice().to_string())]
|
|
DurationLit(String),
|
|
|
|
// Punctuation
|
|
#[token("{")]
|
|
LBrace,
|
|
#[token("}")]
|
|
RBrace,
|
|
#[token("(")]
|
|
LParen,
|
|
#[token(")")]
|
|
RParen,
|
|
#[token("[")]
|
|
LBracket,
|
|
#[token("]")]
|
|
RBracket,
|
|
#[token(":")]
|
|
Colon,
|
|
#[token("::")]
|
|
ColonColon,
|
|
#[token(";")]
|
|
Semicolon,
|
|
#[token(",")]
|
|
Comma,
|
|
#[token(".")]
|
|
Dot,
|
|
#[token("..")]
|
|
DotDot,
|
|
#[token("*")]
|
|
Star,
|
|
#[token("?")]
|
|
Question,
|
|
#[token("@")]
|
|
At,
|
|
|
|
// Operators
|
|
#[token(">")]
|
|
Gt,
|
|
#[token(">=")]
|
|
Ge,
|
|
#[token("<")]
|
|
Lt,
|
|
#[token("<=")]
|
|
Le,
|
|
#[token("->")]
|
|
Arrow,
|
|
|
|
// Special markers
|
|
#[token("---")]
|
|
ProseMarker,
|
|
|
|
// Prose block (handled specially)
|
|
ProseBlock(super::ast::ProseBlock),
|
|
|
|
// Error token
|
|
Error,
|
|
}
|
|
|
|
impl fmt::Display for Token {
|
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
match self {
|
|
| Token::Ident(s) => write!(f, "identifier '{}'", s),
|
|
| Token::NumberLit(n) => write!(f, "number {}", n),
|
|
| Token::DecimalLit(n) => write!(f, "decimal {}", n),
|
|
| Token::TextLit(s) => write!(f, "text \"{}\"", s),
|
|
| Token::TimeLit(s) => write!(f, "time {}", s),
|
|
| Token::DurationLit(s) => write!(f, "duration {}", s),
|
|
| Token::ProseBlock(pb) => write!(f, "prose block ---{}", pb.tag),
|
|
| _ => write!(f, "{:?}", self),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Lexer state machine for handling prose blocks
|
|
#[derive(Debug, Clone)]
|
|
enum LexerState {
|
|
Normal,
|
|
ProseTag, // After seeing first ---
|
|
ProseContent(String, usize), // Tag + content start position
|
|
}
|
|
|
|
/// Wrapper lexer that handles two-mode scanning
|
|
pub struct Lexer<'a> {
|
|
source: &'a str,
|
|
position: usize,
|
|
state: LexerState,
|
|
normal_lexer: Option<logos::Lexer<'a, Token>>,
|
|
lexer_base_offset: usize, // Offset of the substring that normal_lexer is lexing
|
|
}
|
|
|
|
impl<'a> Lexer<'a> {
|
|
pub fn new(source: &'a str) -> Self {
|
|
Self {
|
|
source,
|
|
position: 0,
|
|
state: LexerState::Normal,
|
|
normal_lexer: Some(Token::lexer(source)),
|
|
lexer_base_offset: 0,
|
|
}
|
|
}
|
|
|
|
fn scan_prose_tag(&mut self) -> Option<(usize, Token, usize)> {
|
|
let _start = self.position;
|
|
self.position += 3; // Skip ---
|
|
|
|
// Skip whitespace
|
|
while self.position < self.source.len() &&
|
|
self.source[self.position..].starts_with(|c: char| c.is_whitespace())
|
|
{
|
|
self.position += 1;
|
|
}
|
|
|
|
// Read tag until whitespace or newline
|
|
let tag_start = self.position;
|
|
while self.position < self.source.len() {
|
|
let ch = self.source[self.position..].chars().next().unwrap();
|
|
if ch.is_whitespace() {
|
|
break;
|
|
}
|
|
self.position += ch.len_utf8();
|
|
}
|
|
|
|
let tag = self.source[tag_start..self.position].to_string();
|
|
|
|
// Skip to end of line
|
|
while self.position < self.source.len() {
|
|
let ch = self.source[self.position..].chars().next().unwrap();
|
|
if ch == '\n' {
|
|
self.position += 1;
|
|
break;
|
|
}
|
|
self.position += ch.len_utf8();
|
|
}
|
|
|
|
self.state = LexerState::ProseContent(tag, self.position);
|
|
self.next()
|
|
}
|
|
|
|
fn scan_prose_content(
|
|
&mut self,
|
|
tag: String,
|
|
content_start: usize,
|
|
) -> Option<(usize, Token, usize)> {
|
|
let remaining = &self.source[content_start..];
|
|
let mut byte_offset = 0;
|
|
|
|
// Scan until we find closing ---
|
|
while byte_offset < remaining.len() {
|
|
if remaining[byte_offset..].starts_with("---") {
|
|
// Check if it's at start of line (or after whitespace)
|
|
let is_line_start = byte_offset == 0 ||
|
|
remaining[..byte_offset]
|
|
.chars()
|
|
.rev()
|
|
.take_while(|&c| c != '\n')
|
|
.all(|c| c.is_whitespace());
|
|
|
|
if is_line_start {
|
|
// Found closing marker
|
|
let content_end = content_start + byte_offset;
|
|
let content = self.source[content_start..content_end]
|
|
.trim_end()
|
|
.to_string();
|
|
let start = content_start.saturating_sub(tag.len() + 4); // Include opening ---tag
|
|
self.position = content_end + 3; // Skip closing ---
|
|
self.state = LexerState::Normal;
|
|
self.lexer_base_offset = self.position; // Update base offset for new substring
|
|
self.normal_lexer = Some(Token::lexer(&self.source[self.position..]));
|
|
|
|
let prose_block = super::ast::ProseBlock {
|
|
tag,
|
|
content,
|
|
span: super::ast::Span::new(start, self.position),
|
|
};
|
|
return Some((start, Token::ProseBlock(prose_block), self.position));
|
|
}
|
|
}
|
|
|
|
// Advance by one UTF-8 character to avoid char boundary issues
|
|
if let Some(ch) = remaining[byte_offset..].chars().next() {
|
|
byte_offset += ch.len_utf8();
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
// EOF reached without closing marker - treat as error
|
|
None
|
|
}
|
|
}
|
|
|
|
impl<'a> Iterator for Lexer<'a> {
|
|
type Item = (usize, Token, usize);
|
|
|
|
fn next(&mut self) -> Option<Self::Item> {
|
|
match &self.state {
|
|
| LexerState::Normal => {
|
|
let lexer = self.normal_lexer.as_mut()?;
|
|
|
|
let token = lexer.next()?;
|
|
let span = lexer.span();
|
|
|
|
match token {
|
|
| Ok(Token::ProseMarker) => {
|
|
// Switch to prose mode
|
|
// span is relative to the substring that logos is lexing; add base offset
|
|
self.position = self.lexer_base_offset + span.start;
|
|
self.state = LexerState::ProseTag;
|
|
self.normal_lexer = None;
|
|
self.scan_prose_tag()
|
|
},
|
|
| Ok(tok) => {
|
|
// Adjust span to be relative to original source
|
|
let absolute_start = self.lexer_base_offset + span.start;
|
|
let absolute_end = self.lexer_base_offset + span.end;
|
|
self.position = absolute_end;
|
|
Some((absolute_start, tok, absolute_end))
|
|
},
|
|
| Err(_) => {
|
|
// Adjust span to be relative to original source
|
|
let absolute_start = self.lexer_base_offset + span.start;
|
|
let absolute_end = self.lexer_base_offset + span.end;
|
|
self.position = absolute_end;
|
|
Some((absolute_start, Token::Error, absolute_end))
|
|
},
|
|
}
|
|
},
|
|
| LexerState::ProseTag => {
|
|
// Should not happen - scan_prose_tag transitions state
|
|
None
|
|
},
|
|
| LexerState::ProseContent(tag, content_start) => {
|
|
let tag = tag.clone();
|
|
let content_start = *content_start;
|
|
self.scan_prose_content(tag, content_start)
|
|
},
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn test_basic_tokens() {
|
|
let input = "character Martha { age: 34 }";
|
|
let lexer = Lexer::new(input);
|
|
let tokens: Vec<Token> = lexer.map(|(_, tok, _)| tok).collect();
|
|
|
|
assert_eq!(
|
|
tokens,
|
|
vec![
|
|
Token::Character,
|
|
Token::Ident("Martha".to_string()),
|
|
Token::LBrace,
|
|
Token::Ident("age".to_string()),
|
|
Token::Colon,
|
|
Token::NumberLit(34),
|
|
Token::RBrace,
|
|
]
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_prose_block() {
|
|
let input = r#"
|
|
---backstory
|
|
Martha grew up in a small town.
|
|
She loved baking from a young age.
|
|
---
|
|
"#;
|
|
let lexer = Lexer::new(input.trim());
|
|
let tokens: Vec<Token> = lexer.map(|(_, tok, _)| tok).collect();
|
|
|
|
assert_eq!(tokens.len(), 1);
|
|
match &tokens[0] {
|
|
| Token::ProseBlock(pb) => {
|
|
assert_eq!(pb.tag, "backstory");
|
|
assert!(pb.content.contains("Martha grew up"));
|
|
assert!(pb.content.contains("young age"));
|
|
},
|
|
| _ => panic!("Expected ProseBlock, got {:?}", tokens[0]),
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_prose_with_dashes_in_content() {
|
|
let input = r#"
|
|
---description
|
|
She was well-known for her kind-hearted nature.
|
|
The bakery had a no-nonsense policy.
|
|
---
|
|
"#;
|
|
let lexer = Lexer::new(input.trim());
|
|
let tokens: Vec<Token> = lexer.map(|(_, tok, _)| tok).collect();
|
|
|
|
assert_eq!(tokens.len(), 1);
|
|
match &tokens[0] {
|
|
| Token::ProseBlock(pb) => {
|
|
assert_eq!(pb.tag, "description");
|
|
assert!(pb.content.contains("well-known"));
|
|
assert!(pb.content.contains("kind-hearted"));
|
|
assert!(pb.content.contains("no-nonsense"));
|
|
},
|
|
| _ => panic!("Expected ProseBlock"),
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_multiple_prose_blocks() {
|
|
let input = r#"
|
|
---description
|
|
First prose block content.
|
|
---
|
|
---details
|
|
Second prose block content.
|
|
---
|
|
"#;
|
|
let lexer = Lexer::new(input);
|
|
let tokens: Vec<Token> = lexer.map(|(_, tok, _)| tok).collect();
|
|
|
|
assert_eq!(tokens.len(), 2, "Should have exactly 2 prose block tokens");
|
|
|
|
match &tokens[0] {
|
|
| Token::ProseBlock(pb) => {
|
|
assert_eq!(pb.tag, "description");
|
|
assert!(pb.content.contains("First prose block"));
|
|
},
|
|
| _ => panic!("Expected first ProseBlock, got {:?}", tokens[0]),
|
|
}
|
|
|
|
match &tokens[1] {
|
|
| Token::ProseBlock(pb) => {
|
|
assert_eq!(pb.tag, "details");
|
|
assert!(pb.content.contains("Second prose block"));
|
|
},
|
|
| _ => panic!("Expected second ProseBlock, got {:?}", tokens[1]),
|
|
}
|
|
}
|
|
|
|
#[test]
|
|
fn test_time_duration_literals() {
|
|
let input = "08:30 14:45:00 2h30m 45m";
|
|
let lexer = Lexer::new(input);
|
|
let tokens: Vec<Token> = lexer.map(|(_, tok, _)| tok).collect();
|
|
|
|
assert_eq!(
|
|
tokens,
|
|
vec![
|
|
Token::TimeLit("08:30".to_string()),
|
|
Token::TimeLit("14:45:00".to_string()),
|
|
Token::DurationLit("2h30m".to_string()),
|
|
Token::DurationLit("45m".to_string()),
|
|
]
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_range_syntax() {
|
|
let input = "20..40";
|
|
let lexer = Lexer::new(input);
|
|
let tokens: Vec<Token> = lexer.map(|(_, tok, _)| tok).collect();
|
|
|
|
assert_eq!(
|
|
tokens,
|
|
vec![Token::NumberLit(20), Token::DotDot, Token::NumberLit(40),]
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_type_system_keywords() {
|
|
let input = "concept sub_concept concept_comparison any";
|
|
let lexer = Lexer::new(input);
|
|
let tokens: Vec<Token> = lexer.map(|(_, tok, _)| tok).collect();
|
|
|
|
assert_eq!(
|
|
tokens,
|
|
vec![
|
|
Token::Concept,
|
|
Token::SubConcept,
|
|
Token::ConceptComparison,
|
|
Token::Any,
|
|
]
|
|
);
|
|
}
|
|
}
|