use std::fmt; use logos::Logos; /// Token types for the Storybook language #[derive(Logos, Debug, Clone, PartialEq)] #[logos(skip r"[ \t\n\f]+")] // Skip whitespace #[allow(clippy::duplicated_attributes)] #[logos(skip(r"//[^\n]*", allow_greedy = true))] // Skip line comments #[logos(skip(r"/\*([^*]|\*[^/])*\*/", allow_greedy = true))] // Skip block comments pub enum Token { // Keywords #[token("use")] Use, #[token("character")] Character, #[token("template")] Template, #[token("life_arc")] LifeArc, #[token("schedule")] Schedule, #[token("behavior")] Behavior, #[token("institution")] Institution, #[token("relationship")] Relationship, #[token("location")] Location, #[token("species")] Species, #[token("concept")] Concept, #[token("sub_concept")] SubConcept, #[token("concept_comparison")] ConceptComparison, #[token("any")] Any, #[token("requires")] Requires, #[token("state")] State, #[token("on")] On, #[token("enter")] Enter, #[token("as")] As, #[token("self")] SelfKw, #[token("other")] Other, #[token("remove")] Remove, #[token("append")] Append, #[token("forall")] ForAll, #[token("exists")] Exists, #[token("in")] In, #[token("where")] Where, #[token("and")] And, #[token("or")] Or, #[token("not")] Not, #[token("strict")] Strict, #[token("include")] Include, #[token("from")] From, #[token("is")] Is, #[token("uses")] Uses, #[token("behaviors")] Behaviors, #[token("schedules")] Schedules, #[token("tree")] Tree, #[token("priority")] Priority, #[token("modifies")] Modifies, #[token("override")] Override, #[token("recurrence")] Recurrence, #[token("season")] Season, #[token("block")] Block, #[token("true")] True, #[token("false")] False, // Behavior tree keywords #[token("choose")] Choose, #[token("then")] Then, #[token("if")] If, #[token("when")] When, #[token("repeat")] Repeat, #[token("invert")] Invert, #[token("retry")] Retry, #[token("timeout")] Timeout, #[token("cooldown")] Cooldown, // "guard" keyword removed - use "if" instead (Token::If) #[token("succeed_always")] SucceedAlways, #[token("fail_always")] FailAlways, // Identifiers and literals #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice().to_string())] Ident(String), #[regex(r"-?[0-9]+", |lex| lex.slice().parse::().ok())] NumberLit(i64), #[regex(r"-?[0-9]+\.[0-9]+", |lex| lex.slice().parse::().ok())] DecimalLit(f64), #[regex(r#""([^"\\]|\\.)*""#, |lex| { let s = lex.slice(); s[1..s.len()-1].to_string() })] TextLit(String), // Time literal: HH:MM or HH:MM:SS #[regex(r"[0-9]{2}:[0-9]{2}(:[0-9]{2})?", |lex| lex.slice().to_string())] TimeLit(String), // Duration literal: e.g., 2h30m, 45m, 1h #[regex(r"[0-9]+[hms]([0-9]+[hms])*", |lex| lex.slice().to_string())] DurationLit(String), // Punctuation #[token("{")] LBrace, #[token("}")] RBrace, #[token("(")] LParen, #[token(")")] RParen, #[token("[")] LBracket, #[token("]")] RBracket, #[token(":")] Colon, #[token("::")] ColonColon, #[token(";")] Semicolon, #[token(",")] Comma, #[token(".")] Dot, #[token("..")] DotDot, #[token("*")] Star, #[token("?")] Question, #[token("@")] At, // Operators #[token(">")] Gt, #[token(">=")] Ge, #[token("<")] Lt, #[token("<=")] Le, #[token("->")] Arrow, // Special markers #[token("---")] ProseMarker, // Prose block (handled specially) ProseBlock(super::ast::ProseBlock), // Error token Error, } impl fmt::Display for Token { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { | Token::Ident(s) => write!(f, "identifier '{}'", s), | Token::NumberLit(n) => write!(f, "number {}", n), | Token::DecimalLit(n) => write!(f, "decimal {}", n), | Token::TextLit(s) => write!(f, "text \"{}\"", s), | Token::TimeLit(s) => write!(f, "time {}", s), | Token::DurationLit(s) => write!(f, "duration {}", s), | Token::ProseBlock(pb) => write!(f, "prose block ---{}", pb.tag), | _ => write!(f, "{:?}", self), } } } /// Lexer state machine for handling prose blocks #[derive(Debug, Clone)] enum LexerState { Normal, ProseTag, // After seeing first --- ProseContent(String, usize), // Tag + content start position } /// Wrapper lexer that handles two-mode scanning pub struct Lexer<'a> { source: &'a str, position: usize, state: LexerState, normal_lexer: Option>, lexer_base_offset: usize, // Offset of the substring that normal_lexer is lexing } impl<'a> Lexer<'a> { pub fn new(source: &'a str) -> Self { Self { source, position: 0, state: LexerState::Normal, normal_lexer: Some(Token::lexer(source)), lexer_base_offset: 0, } } fn scan_prose_tag(&mut self) -> Option<(usize, Token, usize)> { let _start = self.position; self.position += 3; // Skip --- // Skip whitespace while self.position < self.source.len() && self.source[self.position..].starts_with(|c: char| c.is_whitespace()) { self.position += 1; } // Read tag until whitespace or newline let tag_start = self.position; while self.position < self.source.len() { let ch = self.source[self.position..].chars().next().unwrap(); if ch.is_whitespace() { break; } self.position += ch.len_utf8(); } let tag = self.source[tag_start..self.position].to_string(); // Skip to end of line while self.position < self.source.len() { let ch = self.source[self.position..].chars().next().unwrap(); if ch == '\n' { self.position += 1; break; } self.position += ch.len_utf8(); } self.state = LexerState::ProseContent(tag, self.position); self.next() } fn scan_prose_content( &mut self, tag: String, content_start: usize, ) -> Option<(usize, Token, usize)> { let remaining = &self.source[content_start..]; let mut byte_offset = 0; // Scan until we find closing --- while byte_offset < remaining.len() { if remaining[byte_offset..].starts_with("---") { // Check if it's at start of line (or after whitespace) let is_line_start = byte_offset == 0 || remaining[..byte_offset] .chars() .rev() .take_while(|&c| c != '\n') .all(|c| c.is_whitespace()); if is_line_start { // Found closing marker let content_end = content_start + byte_offset; let content = self.source[content_start..content_end] .trim_end() .to_string(); let start = content_start.saturating_sub(tag.len() + 4); // Include opening ---tag self.position = content_end + 3; // Skip closing --- self.state = LexerState::Normal; self.lexer_base_offset = self.position; // Update base offset for new substring self.normal_lexer = Some(Token::lexer(&self.source[self.position..])); let prose_block = super::ast::ProseBlock { tag, content, span: super::ast::Span::new(start, self.position), }; return Some((start, Token::ProseBlock(prose_block), self.position)); } } // Advance by one UTF-8 character to avoid char boundary issues if let Some(ch) = remaining[byte_offset..].chars().next() { byte_offset += ch.len_utf8(); } else { break; } } // EOF reached without closing marker - treat as error None } } impl<'a> Iterator for Lexer<'a> { type Item = (usize, Token, usize); fn next(&mut self) -> Option { match &self.state { | LexerState::Normal => { let lexer = self.normal_lexer.as_mut()?; let token = lexer.next()?; let span = lexer.span(); match token { | Ok(Token::ProseMarker) => { // Switch to prose mode // span is relative to the substring that logos is lexing; add base offset self.position = self.lexer_base_offset + span.start; self.state = LexerState::ProseTag; self.normal_lexer = None; self.scan_prose_tag() }, | Ok(tok) => { // Adjust span to be relative to original source let absolute_start = self.lexer_base_offset + span.start; let absolute_end = self.lexer_base_offset + span.end; self.position = absolute_end; Some((absolute_start, tok, absolute_end)) }, | Err(_) => { // Adjust span to be relative to original source let absolute_start = self.lexer_base_offset + span.start; let absolute_end = self.lexer_base_offset + span.end; self.position = absolute_end; Some((absolute_start, Token::Error, absolute_end)) }, } }, | LexerState::ProseTag => { // Should not happen - scan_prose_tag transitions state None }, | LexerState::ProseContent(tag, content_start) => { let tag = tag.clone(); let content_start = *content_start; self.scan_prose_content(tag, content_start) }, } } } #[cfg(test)] mod tests { use super::*; #[test] fn test_basic_tokens() { let input = "character Martha { age: 34 }"; let lexer = Lexer::new(input); let tokens: Vec = lexer.map(|(_, tok, _)| tok).collect(); assert_eq!( tokens, vec![ Token::Character, Token::Ident("Martha".to_string()), Token::LBrace, Token::Ident("age".to_string()), Token::Colon, Token::NumberLit(34), Token::RBrace, ] ); } #[test] fn test_prose_block() { let input = r#" ---backstory Martha grew up in a small town. She loved baking from a young age. --- "#; let lexer = Lexer::new(input.trim()); let tokens: Vec = lexer.map(|(_, tok, _)| tok).collect(); assert_eq!(tokens.len(), 1); match &tokens[0] { | Token::ProseBlock(pb) => { assert_eq!(pb.tag, "backstory"); assert!(pb.content.contains("Martha grew up")); assert!(pb.content.contains("young age")); }, | _ => panic!("Expected ProseBlock, got {:?}", tokens[0]), } } #[test] fn test_prose_with_dashes_in_content() { let input = r#" ---description She was well-known for her kind-hearted nature. The bakery had a no-nonsense policy. --- "#; let lexer = Lexer::new(input.trim()); let tokens: Vec = lexer.map(|(_, tok, _)| tok).collect(); assert_eq!(tokens.len(), 1); match &tokens[0] { | Token::ProseBlock(pb) => { assert_eq!(pb.tag, "description"); assert!(pb.content.contains("well-known")); assert!(pb.content.contains("kind-hearted")); assert!(pb.content.contains("no-nonsense")); }, | _ => panic!("Expected ProseBlock"), } } #[test] fn test_multiple_prose_blocks() { let input = r#" ---description First prose block content. --- ---details Second prose block content. --- "#; let lexer = Lexer::new(input); let tokens: Vec = lexer.map(|(_, tok, _)| tok).collect(); assert_eq!(tokens.len(), 2, "Should have exactly 2 prose block tokens"); match &tokens[0] { | Token::ProseBlock(pb) => { assert_eq!(pb.tag, "description"); assert!(pb.content.contains("First prose block")); }, | _ => panic!("Expected first ProseBlock, got {:?}", tokens[0]), } match &tokens[1] { | Token::ProseBlock(pb) => { assert_eq!(pb.tag, "details"); assert!(pb.content.contains("Second prose block")); }, | _ => panic!("Expected second ProseBlock, got {:?}", tokens[1]), } } #[test] fn test_time_duration_literals() { let input = "08:30 14:45:00 2h30m 45m"; let lexer = Lexer::new(input); let tokens: Vec = lexer.map(|(_, tok, _)| tok).collect(); assert_eq!( tokens, vec![ Token::TimeLit("08:30".to_string()), Token::TimeLit("14:45:00".to_string()), Token::DurationLit("2h30m".to_string()), Token::DurationLit("45m".to_string()), ] ); } #[test] fn test_range_syntax() { let input = "20..40"; let lexer = Lexer::new(input); let tokens: Vec = lexer.map(|(_, tok, _)| tok).collect(); assert_eq!( tokens, vec![Token::NumberLit(20), Token::DotDot, Token::NumberLit(40),] ); } #[test] fn test_type_system_keywords() { let input = "concept sub_concept concept_comparison any"; let lexer = Lexer::new(input); let tokens: Vec = lexer.map(|(_, tok, _)| tok).collect(); assert_eq!( tokens, vec![ Token::Concept, Token::SubConcept, Token::ConceptComparison, Token::Any, ] ); } }