1033 lines
33 KiB
C
1033 lines
33 KiB
C
/* sed.c - stream editor. Thing that does s/// and other stuff.
|
|
*
|
|
* Copyright 2014 Rob Landley <rob@landley.net>
|
|
*
|
|
* See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
|
|
*
|
|
* TODO: lines > 2G could wrap signed int length counters. Not just getline()
|
|
* but N and s///
|
|
* TODO: make y// handle unicode, unicode delimiters
|
|
* TODO: handle error return from emit(), error_msg/exit consistently
|
|
* What's the right thing to do for -i when write fails? Skip to next?
|
|
* test '//q' with no previous regex, also repeat previous regex?
|
|
*
|
|
* Deviations from POSIX: allow extended regular expressions with -r,
|
|
* editing in place with -i, separate with -s, NUL-separated input with -z,
|
|
* printf escapes in text, line continuations, semicolons after all commands,
|
|
* 2-address anywhere an address is allowed, "T" command, multiline
|
|
* continuations for [abc], \; to end [abc] argument before end of line.
|
|
|
|
USE_SED(NEWTOY(sed, "(help)(version)e*f*i:;nErz(null-data)s[+Er]", TOYFLAG_BIN|TOYFLAG_LOCALE|TOYFLAG_NOHELP))
|
|
|
|
config SED
|
|
bool "sed"
|
|
default y
|
|
help
|
|
usage: sed [-inrszE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
|
|
|
|
Stream editor. Apply editing SCRIPTs to lines of input.
|
|
|
|
-e Add SCRIPT to list
|
|
-f Add contents of SCRIPT_FILE to list
|
|
-i Edit each file in place (-iEXT keeps backup file with extension EXT)
|
|
-n No default output (use the p command to output matched lines)
|
|
-r Use extended regular expression syntax
|
|
-E POSIX alias for -r
|
|
-s Treat input files separately (implied by -i)
|
|
-z Use \0 rather than \n as input line separator
|
|
|
|
A SCRIPT is one or more COMMANDs separated by newlines or semicolons.
|
|
All -e SCRIPTs are combined as if separated by newlines, followed by all -f
|
|
SCRIPT_FILEs. If no -e or -f then first argument is the SCRIPT.
|
|
|
|
COMMANDs apply to every line unless prefixed with an ADDRESS of the form:
|
|
|
|
[ADDRESS[,ADDRESS]][!]COMMAND
|
|
|
|
ADDRESS is a line number (starting at 1), a /REGULAR EXPRESSION/, or $ for
|
|
last line (-s or -i makes it last line of each file). One address matches one
|
|
line, ADDRESS,ADDRESS matches from first to second inclusive. Two regexes can
|
|
match multiple ranges. ADDRESS,+N ends N lines later. ! inverts the match.
|
|
|
|
REGULAR EXPRESSIONS start and end with the same character (anything but
|
|
backslash or newline). To use the delimiter in the regex escape it with a
|
|
backslash, and printf escapes (\abcefnrtv and octal, hex, and unicode) work.
|
|
An empty regex repeats the previous one. ADDRESS regexes require any
|
|
first delimiter except / to be \escaped to distinguish it from COMMANDs.
|
|
|
|
Sed reads each line of input, processes it, and writes it out or discards it
|
|
before reading the next. Sed can remember one additional line in a separate
|
|
buffer (the h, H, g, G, and x commands), and can read the next line of input
|
|
early (the n and N commands), but otherwise operates on individual lines.
|
|
|
|
Each COMMAND starts with a single character. Commands with no arguments are:
|
|
|
|
! Run this command when the ADDRESS _didn't_ match.
|
|
{ Start new command block, continuing until a corresponding "}".
|
|
Command blocks nest and can have ADDRESSes applying to the whole block.
|
|
} End command block (this COMMAND cannot have an address)
|
|
d Delete this line and move on to the next one
|
|
(ignores remaining COMMANDs)
|
|
D Delete one line of input and restart command SCRIPT (same as "d"
|
|
unless you've glued lines together with "N" or similar)
|
|
g Get remembered line (overwriting current line)
|
|
G Get remembered line (appending to current line)
|
|
h Remember this line (overwriting remembered line)
|
|
H Remember this line (appending to remembered line, if any)
|
|
l Print line escaping \abfrtv (but not \n), octal escape other nonprinting
|
|
chars, wrap lines to terminal width with \, append $ to end of line.
|
|
n Print default output and read next line over current line (quit at EOF)
|
|
N Append \n and next line of input to this line. Quit at EOF without
|
|
default output. Advances line counter for ADDRESS and "=".
|
|
p Print this line
|
|
P Print this line up to first newline (from "N")
|
|
q Quit (print default output, no more commands processed or lines read)
|
|
x Exchange this line with remembered line (overwrite in both directions)
|
|
= Print the current line number (plus newline)
|
|
# Comment, ignores rest of this line of SCRIPT (until newline)
|
|
|
|
Commands that take an argument:
|
|
|
|
: LABEL Target for jump commands
|
|
a TEXT Append text to output before reading next line
|
|
b LABEL Branch, jumps to :LABEL (with no LABEL to end of SCRIPT)
|
|
c TEXT Delete matching ADDRESS range and output TEXT instead
|
|
i TEXT Insert text (output immediately)
|
|
r FILE Append contents of FILE to output before reading next line.
|
|
s/S/R/F Search for regex S replace match with R using flags F. Delimiter
|
|
is anything but \n or \, escape with \ to use in S or R. Printf
|
|
escapes work. Unescaped & in R becomes full matched text, \1
|
|
through \9 = parenthetical subexpression from S. \ at end of
|
|
line appends next line of SCRIPT. The flags in F are:
|
|
[0-9] A number N, substitute only Nth match
|
|
g Global, substitute all matches
|
|
i/I Ignore case when matching
|
|
p Print resulting line when match found and replaced
|
|
w [file] Write (append) line to file when match replaced
|
|
t LABEL Test, jump if s/// command matched this line since last test
|
|
T LABEL Test false, jump to :LABEL only if no s/// found a match
|
|
w FILE Write (append) line to file
|
|
y/old/new/ Change each character in 'old' to corresponding character
|
|
in 'new' (with standard backslash escapes, delimiter can be
|
|
any repeated character except \ or \n)
|
|
|
|
The TEXT arguments (to a c i) may end with an unescaped "\" to append
|
|
the next line (leading whitespace is not skipped), and treat ";" as a
|
|
literal character (use "\;" instead).
|
|
*/
|
|
|
|
#define FOR_sed
|
|
#include "toys.h"
|
|
|
|
GLOBALS(
|
|
char *i;
|
|
struct arg_list *f, *e;
|
|
|
|
// processed pattern list
|
|
struct double_list *pattern;
|
|
|
|
char *nextline, *remember;
|
|
void *restart, *lastregex;
|
|
long nextlen, rememberlen, count;
|
|
int fdout, noeol;
|
|
unsigned xx;
|
|
char delim;
|
|
)
|
|
|
|
// One parsed sed command, kept in a doubly linked list. Everything variable
// sized (compiled regexes, argument strings) lives in the same allocation
// after the struct and is located by byte offset from the start of the
// struct (offset+(char *)struct) rather than by pointer, so the allocation
// can be realloc()ed for multiline input without fixing up pointers.

struct sedcmd {
  struct sedcmd *next;
  struct sedcmd *prev;

  // Address range: index 0 is where a match begins, index 1 where it ends
  long lmatch[2];    // line number of match
  int rmatch[2];     // offset of regex struct for prefix matches (/abc/,/def/p)
  int arg1;          // offset of first argument
  int arg2;          // offset of second argument
  int w;             // offset of s///w (or w) file data
  unsigned not;      // set by "!": run when the address did NOT match
  unsigned hit;      // range currently active (parser: pending continuation)
  unsigned sflags;   // s///flag bits: i=1, g=2, p=4, x=8, match number in >>4
  char c;            // action
};
|
|
|
|
// Write out line with potential embedded NUL, handling eol/noeol
|
|
static int emit(char *line, long len, int eol)
|
|
{
|
|
int l, old = line[len];
|
|
|
|
if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
|
|
TT.noeol = !eol;
|
|
if (eol) line[len++] = '\n';
|
|
if (!len) return 0;
|
|
l = writeall(TT.fdout, line, len);
|
|
if (eol) line[len-1] = old;
|
|
if (l != len) {
|
|
if (TT.fdout != 1) perror_msg("short write");
|
|
|
|
return 1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
// Grow the allocation at *old (currently oldlen bytes of payload) and append
// a copy of new to it, followed by a NUL terminator. A negative newlen means
// "append -newlen bytes, but put a newline between old and new data".
// *old is updated to the (possibly moved) allocation.
// Returns a pointer to the byte just past the NUL terminator.

static char *extend_string(char **old, char *new, int oldlen, int newlen)
{
  int sep = 0;
  char *buf, *tail;

  if (newlen < 0) {
    sep = 1;
    newlen = -newlen;
  }

  // old data + optional separator + new data + terminator
  buf = xrealloc(*old, oldlen+newlen+sep+1);
  *old = buf;

  tail = buf+oldlen;
  if (sep) *tail++ = '\n';
  memcpy(tail, new, newlen);
  tail += newlen;
  *tail = 0;

  return tail+1;
}
|
|
|
|
// An empty regex repeats the previous one
|
|
static void *get_regex(void *command, int offset)
|
|
{
|
|
if (!offset) {
|
|
if (!TT.lastregex) error_exit("no previous regex");
|
|
return TT.lastregex;
|
|
}
|
|
|
|
return TT.lastregex = offset+(char *)command;
|
|
}
|
|
|
|
// Apply pattern to line from input file
|
|
static void sed_line(char **pline, long plen)
|
|
{
|
|
struct append {
|
|
struct append *next, *prev;
|
|
int file;
|
|
char *str;
|
|
} *append = 0;
|
|
char *line = TT.nextline;
|
|
long len = TT.nextlen;
|
|
struct sedcmd *command;
|
|
int eol = 0, tea = 0;
|
|
|
|
// Ignore EOF for all files before last unless -i
|
|
if (!pline && !FLAG(i) && !FLAG(s)) return;
|
|
|
|
// Grab next line for deferred processing (EOF detection: we get a NULL
|
|
// pline at EOF to flush last line). Note that only end of _last_ input
|
|
// file matches $ (unless we're doing -i).
|
|
TT.nextline = 0;
|
|
TT.nextlen = 0;
|
|
if (pline) {
|
|
TT.nextline = *pline;
|
|
TT.nextlen = plen;
|
|
*pline = 0;
|
|
}
|
|
|
|
if (!line || !len) return;
|
|
if (line[len-1] == '\n') line[--len] = eol++;
|
|
TT.count++;
|
|
|
|
// The restart-1 is because we added one to make sure it wasn't NULL,
|
|
// otherwise N as last command would restart script
|
|
command = TT.restart ? ((struct sedcmd *)TT.restart)-1 : (void *)TT.pattern;
|
|
TT.restart = 0;
|
|
|
|
while (command) {
|
|
char *str, c = command->c;
|
|
|
|
// Have we got a line or regex matching range for this rule?
|
|
if (*command->lmatch || *command->rmatch) {
|
|
int miss = 0;
|
|
long lm;
|
|
|
|
// In a match that might end?
|
|
if (command->hit) {
|
|
if (!(lm = command->lmatch[1])) {
|
|
if (!command->rmatch[1]) command->hit = 0;
|
|
else {
|
|
void *rm = get_regex(command, command->rmatch[1]);
|
|
|
|
// regex match end includes matching line, so defer deactivation
|
|
if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1;
|
|
}
|
|
} else if (lm > 0 && lm < TT.count) command->hit = 0;
|
|
else if (lm < -1 && TT.count == command->hit+(-lm-1)) command->hit = 0;
|
|
|
|
// Start a new match?
|
|
} else {
|
|
if (!(lm = *command->lmatch)) {
|
|
void *rm = get_regex(command, *command->rmatch);
|
|
|
|
if (line && !regexec0(rm, line, len, 0, 0, 0))
|
|
command->hit = TT.count;
|
|
} else if (lm == TT.count || (lm == -1 && !pline))
|
|
command->hit = TT.count;
|
|
|
|
if (!command->lmatch[1] && !command->rmatch[1]) miss = 1;
|
|
}
|
|
|
|
// Didn't match?
|
|
lm = !(command->not^!!command->hit);
|
|
|
|
// Deferred disable from regex end match
|
|
if (miss || command->lmatch[1] == TT.count) command->hit = 0;
|
|
|
|
if (lm) {
|
|
// Handle skipping curly bracket command group
|
|
if (c == '{') {
|
|
int curly = 1;
|
|
|
|
while (curly) {
|
|
command = command->next;
|
|
if (command->c == '{') curly++;
|
|
if (command->c == '}') curly--;
|
|
}
|
|
}
|
|
command = command->next;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// A deleted line can still update line match state for later commands
|
|
if (!line) {
|
|
command = command->next;
|
|
continue;
|
|
}
|
|
|
|
// Process command
|
|
|
|
if (c=='a' || c=='r') {
|
|
struct append *a = xzalloc(sizeof(struct append));
|
|
if (command->arg1) a->str = command->arg1+(char *)command;
|
|
a->file = c=='r';
|
|
dlist_add_nomalloc((void *)&append, (void *)a);
|
|
} else if (c=='b' || c=='t' || c=='T') {
|
|
int t = tea;
|
|
|
|
if (c != 'b') tea = 0;
|
|
if (c=='b' || t^(c=='T')) {
|
|
if (!command->arg1) break;
|
|
str = command->arg1+(char *)command;
|
|
for (command = (void *)TT.pattern; command; command = command->next)
|
|
if (command->c == ':' && !strcmp(command->arg1+(char *)command, str))
|
|
break;
|
|
if (!command) error_exit("no :%s", str);
|
|
}
|
|
} else if (c=='c') {
|
|
str = command->arg1+(char *)command;
|
|
if (!command->hit) emit(str, strlen(str), 1);
|
|
free(line);
|
|
line = 0;
|
|
continue;
|
|
} else if (c=='d') {
|
|
free(line);
|
|
line = 0;
|
|
continue;
|
|
} else if (c=='D') {
|
|
// Delete up to \n or end of buffer
|
|
str = line;
|
|
while ((str-line)<len) if (*(str++) == '\n') break;
|
|
len -= str - line;
|
|
memmove(line, str, len);
|
|
|
|
// if "delete" blanks line, disable further processing
|
|
// otherwise trim and restart script
|
|
if (!len) {
|
|
free(line);
|
|
line = 0;
|
|
} else {
|
|
line[len] = 0;
|
|
command = (void *)TT.pattern;
|
|
}
|
|
continue;
|
|
} else if (c=='g') {
|
|
free(line);
|
|
line = xstrdup(TT.remember);
|
|
len = TT.rememberlen;
|
|
} else if (c=='G') {
|
|
line = xrealloc(line, len+TT.rememberlen+2);
|
|
line[len++] = '\n';
|
|
memcpy(line+len, TT.remember, TT.rememberlen);
|
|
line[len += TT.rememberlen] = 0;
|
|
} else if (c=='h') {
|
|
free(TT.remember);
|
|
TT.remember = xstrdup(line);
|
|
TT.rememberlen = len;
|
|
} else if (c=='H') {
|
|
TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
|
|
TT.remember[TT.rememberlen++] = '\n';
|
|
memcpy(TT.remember+TT.rememberlen, line, len);
|
|
TT.remember[TT.rememberlen += len] = 0;
|
|
} else if (c=='i') {
|
|
str = command->arg1+(char *)command;
|
|
emit(str, strlen(str), 1);
|
|
} else if (c=='l') {
|
|
int i, x, off;
|
|
|
|
if (!TT.xx) {
|
|
terminal_size(&TT.xx, 0);
|
|
if (!TT.xx) TT.xx = 80;
|
|
if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
|
|
if (TT.xx > 4) TT.xx -= 4;
|
|
}
|
|
|
|
for (i = off = 0; i<len; i++) {
|
|
if (off >= TT.xx) {
|
|
toybuf[off++] = '\\';
|
|
emit(toybuf, off, 1);
|
|
off = 0;
|
|
}
|
|
x = stridx("\\\a\b\f\r\t\v", line[i]);
|
|
if (x != -1) {
|
|
toybuf[off++] = '\\';
|
|
toybuf[off++] = "\\abfrtv"[x];
|
|
} else if (line[i] >= ' ') toybuf[off++] = line[i];
|
|
else off += sprintf(toybuf+off, "\\%03o", line[i]);
|
|
}
|
|
toybuf[off++] = '$';
|
|
emit(toybuf, off, 1);
|
|
} else if (c=='n') {
|
|
TT.restart = command->next+1;
|
|
|
|
break;
|
|
} else if (c=='N') {
|
|
// Can't just grab next line because we could have multiple N and
|
|
// we need to actually read ahead to get N;$p EOF detection right.
|
|
if (pline) {
|
|
TT.restart = command->next+1;
|
|
extend_string(&line, TT.nextline, len, -TT.nextlen);
|
|
free(TT.nextline);
|
|
TT.nextline = line;
|
|
TT.nextlen += len + 1;
|
|
line = 0;
|
|
}
|
|
|
|
// Pending append goes out right after N
|
|
goto done;
|
|
} else if (c=='p' || c=='P') {
|
|
char *l = (c=='P') ? strchr(line, '\n') : 0;
|
|
|
|
if (emit(line, l ? l-line : len, eol)) break;
|
|
} else if (c=='q' || c=='Q') {
|
|
if (pline) *pline = (void *)1;
|
|
free(TT.nextline);
|
|
if (!toys.exitval && command->arg1)
|
|
toys.exitval = atoi(command->arg1+(char *)command);
|
|
TT.nextline = 0;
|
|
TT.nextlen = 0;
|
|
if (c=='Q') line = 0;
|
|
|
|
break;
|
|
} else if (c=='s') {
|
|
char *rline = line, *new = command->arg2 + (char *)command, *l2 = 0;
|
|
regmatch_t *match = (void *)toybuf;
|
|
regex_t *reg = get_regex(command, command->arg1);
|
|
int mflags = 0, count = 0, l2used = 0, zmatch = 1, l2l = len, l2old = 0,
|
|
mlen, off, newlen;
|
|
|
|
// Loop finding match in remaining line (up to remaining len)
|
|
while (!regexec0(reg, rline, len-(rline-line), 10, match, mflags)) {
|
|
mflags = REG_NOTBOL;
|
|
|
|
// Zero length matches don't count immediately after a previous match
|
|
mlen = match[0].rm_eo-match[0].rm_so;
|
|
if (!mlen && !zmatch) {
|
|
if (rline-line == len) break;
|
|
l2[l2used++] = *rline++;
|
|
zmatch++;
|
|
continue;
|
|
} else zmatch = 0;
|
|
|
|
// If we're replacing only a specific match, skip if this isn't it
|
|
off = command->sflags>>4;
|
|
if (off && off != ++count) {
|
|
if (l2) memcpy(l2+l2used, rline, match[0].rm_eo);
|
|
l2used += match[0].rm_eo;
|
|
rline += match[0].rm_eo;
|
|
|
|
continue;
|
|
}
|
|
// The fact getline() can allocate unbounded amounts of memory is
|
|
// a bigger issue, but while we're here check for integer overflow
|
|
if (match[0].rm_eo > INT_MAX) perror_exit(0);
|
|
|
|
// newlen = strlen(new) but with \1 and & and printf escapes
|
|
for (off = newlen = 0; new[off]; off++) {
|
|
int cc = -1;
|
|
|
|
if (new[off] == '&') cc = 0;
|
|
else if (new[off] == '\\') cc = new[++off] - '0';
|
|
if (cc < 0 || cc > 9) {
|
|
newlen++;
|
|
continue;
|
|
}
|
|
newlen += match[cc].rm_eo-match[cc].rm_so;
|
|
}
|
|
|
|
// Copy changed data to new string
|
|
|
|
// Adjust allocation size of new string, copy data we know we'll keep
|
|
l2l += newlen-mlen;
|
|
if ((l2l|0xfff) > l2old) l2 = xrealloc(l2, l2old = (l2l|0xfff)+1);
|
|
if (match[0].rm_so) {
|
|
memcpy(l2+l2used, rline, match[0].rm_so);
|
|
l2used += match[0].rm_so;
|
|
}
|
|
|
|
// copy in new replacement text
|
|
for (off = mlen = 0; new[off]; off++) {
|
|
int cc = 0, ll;
|
|
|
|
if (new[off] == '\\') {
|
|
cc = new[++off] - '0';
|
|
if (cc<0 || cc>9) {
|
|
if (!(l2[l2used+mlen++] = unescape(new[off])))
|
|
l2[l2used+mlen-1] = new[off];
|
|
|
|
continue;
|
|
} else if (cc > reg->re_nsub) error_exit("no s//\\%d/", cc);
|
|
} else if (new[off] != '&') {
|
|
l2[l2used+mlen++] = new[off];
|
|
|
|
continue;
|
|
}
|
|
|
|
if (match[cc].rm_so != -1) {
|
|
ll = match[cc].rm_eo-match[cc].rm_so;
|
|
memcpy(l2+l2used+mlen, rline+match[cc].rm_so, ll);
|
|
mlen += ll;
|
|
}
|
|
}
|
|
l2used += newlen;
|
|
rline += match[0].rm_eo;
|
|
|
|
// Stop after first substitution unless we have flag g
|
|
if (!(command->sflags & 2)) break;
|
|
}
|
|
|
|
// If we made any changes, finish off l2 and swap it for line
|
|
if (l2) {
|
|
// grab trailing unmatched data and null terminator, swap with original
|
|
mlen = len-(rline-line);
|
|
memcpy(l2+l2used, rline, mlen+1);
|
|
len = l2used + mlen;
|
|
free(line);
|
|
line = l2;
|
|
}
|
|
|
|
if (mflags) {
|
|
// flag p
|
|
if (command->sflags & 4) emit(line, len, eol);
|
|
|
|
tea = 1;
|
|
if (command->w) goto writenow;
|
|
}
|
|
} else if (c=='w') {
|
|
int fd, noeol;
|
|
char *name;
|
|
|
|
writenow:
|
|
// Swap out emit() context
|
|
fd = TT.fdout;
|
|
noeol = TT.noeol;
|
|
|
|
// We save filehandle and newline status before filename
|
|
name = command->w + (char *)command;
|
|
memcpy(&TT.fdout, name, 4);
|
|
name += 4;
|
|
TT.noeol = *(name++);
|
|
|
|
// write, then save/restore context
|
|
if (emit(line, len, eol))
|
|
perror_exit("w '%s'", command->arg1+(char *)command);
|
|
*(--name) = TT.noeol;
|
|
TT.noeol = noeol;
|
|
TT.fdout = fd;
|
|
} else if (c=='x') {
|
|
long swap = TT.rememberlen;
|
|
|
|
str = TT.remember;
|
|
TT.remember = line;
|
|
line = str;
|
|
TT.rememberlen = len;
|
|
len = swap;
|
|
} else if (c=='y') {
|
|
char *from, *to = (char *)command;
|
|
int i, j;
|
|
|
|
from = to+command->arg1;
|
|
to += command->arg2;
|
|
|
|
for (i = 0; i < len; i++) {
|
|
j = stridx(from, line[i]);
|
|
if (j != -1) line[i] = to[j];
|
|
}
|
|
} else if (c=='=') {
|
|
sprintf(toybuf, "%ld", TT.count);
|
|
if (emit(toybuf, strlen(toybuf), 1)) break;
|
|
}
|
|
|
|
command = command->next;
|
|
}
|
|
|
|
if (line && !FLAG(n)) emit(line, len, eol);
|
|
|
|
done:
|
|
if (dlist_terminate(append)) while (append) {
|
|
struct append *a = append->next;
|
|
|
|
if (append->file) {
|
|
int fd = open(append->str, O_RDONLY);
|
|
|
|
// Force newline if noeol pending
|
|
if (fd != -1) {
|
|
if (TT.noeol) xwrite(TT.fdout, "\n", 1);
|
|
TT.noeol = 0;
|
|
xsendfile(fd, TT.fdout);
|
|
close(fd);
|
|
}
|
|
} else if (append->str) emit(append->str, strlen(append->str), 1);
|
|
else emit(line, 0, 0);
|
|
free(append);
|
|
append = a;
|
|
}
|
|
free(line);
|
|
}
|
|
|
|
// Callback called on each input file
|
|
static void do_sed_file(int fd, char *name)
|
|
{
|
|
char *tmp, *s;
|
|
|
|
if (FLAG(i)) {
|
|
if (!fd) return error_msg("-i on stdin");
|
|
TT.fdout = copy_tempfile(fd, name, &tmp);
|
|
}
|
|
if (FLAG(i) || FLAG(s)) {
|
|
struct sedcmd *command;
|
|
|
|
TT.count = 0;
|
|
for (command = (void *)TT.pattern; command; command = command->next)
|
|
command->hit = 0;
|
|
}
|
|
do_lines(fd, TT.delim, sed_line);
|
|
if (FLAG(i)) {
|
|
if (TT.i && *TT.i) {
|
|
xrename(name, s = xmprintf("%s%s", name, TT.i));
|
|
free(s);
|
|
}
|
|
replace_tempfile(-1, TT.fdout, &tmp);
|
|
TT.fdout = 1;
|
|
}
|
|
if (FLAG(i) || FLAG(s)) {
|
|
TT.nextline = 0;
|
|
TT.nextlen = TT.noeol = 0;
|
|
}
|
|
}
|
|
|
|
// Copy chunk of string between two delimiters, converting printf escapes.
// Returns processed copy of string (0 if error), *pstr advances to next
// unused char. If delim (or *delim) is 0 uses/saves starting char as
// delimiter. The delimiter is ignored inside regex [ranges].
// The returned string is malloc()ed and belongs to the caller.
static char *unescape_delimited_string(char **pstr, char *delim)
{
  char *buf, *to, *from, mode = 0, d;

  // Grab leading delimiter (if necessary), allocate space for new string
  from = *pstr;
  if (!delim || !*delim) {
    if (!(d = *(from++))) return 0;
    if (d == '\\') d = *(from++);
    if (!d || d == '\\') return 0;
    if (delim) *delim = d;
  } else d = *delim;
  // Output is never longer than the input
  to = buf = xmalloc(strlen(*pstr)+1);

  // mode is 0 outside a bracket expression, ']' inside one, and one of
  // . = : inside a nested [. .] [= =] [: :] element.
  while (mode || *from != d) {
    if (!*from) goto fail;

    // delimiter in regex character range doesn't count
    if (*from == '[') {
      if (!mode) {
        mode = ']';
        if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
      // (strchr() would also "find" the terminating NUL, so rule that out
      // or we'd step over the end of the string.)
      } else if (mode == ']' && from[1] && strchr(".=:", from[1])) {
        *(to++) = *(from++);
        mode = *from;
      }
    } else if (*from == mode) {
      if (mode == ']') mode = 0;
      else {
        *(to++) = *(from++);
        mode = ']';
      }
    // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
    // but the perl build does it, so we need to filter it out.
    } else if (mode && *from == '-' && from[-1] == from[1]) {
      from+=2;
      continue;
    } else if (*from == '\\') {
      if (!from[1]) goto fail;

      // Check escaped end delimiter before printf style escapes.
      if (from[1] == d) from++;
      else if (from[1]=='\\') *(to++) = *(from++);
      else {
        char c = unescape(from[1]);

        if (c) {
          *(to++) = c;
          from+=2;
          continue;
        } else if (!mode) *(to++) = *(from++);
      }
    }
    *(to++) = *(from++);
  }
  *to = 0;
  *pstr = from+1;

  return buf;

fail:
  // Ran off the end of the input without finding the closing delimiter
  free(buf);

  return 0;
}
|
|
|
|
// Translate pattern strings into command structures. Each command structure
|
|
// is a single allocation (which requires some math and remalloc at times).
|
|
static void parse_pattern(char **pline, long len)
|
|
{
|
|
struct sedcmd *command = (void *)TT.pattern;
|
|
char *line, *reg, c, *errstart;
|
|
int i;
|
|
|
|
line = errstart = pline ? *pline : "";
|
|
if (len && line[len-1]=='\n') line[--len] = 0;
|
|
|
|
// Append this line to previous multiline command? (hit indicates type.)
|
|
// During parsing "hit" stores data about line continuations, but in
|
|
// sed_line() it means the match range attached to this command
|
|
// is active, so processing the continuation must zero it again.
|
|
if (command && command->prev->hit) {
|
|
// Remove half-finished entry from list so remalloc() doesn't confuse it
|
|
TT.pattern = TT.pattern->prev;
|
|
command = dlist_pop(&TT.pattern);
|
|
c = command->c;
|
|
reg = (char *)command;
|
|
reg += command->arg1 + strlen(reg + command->arg1);
|
|
|
|
// Resume parsing for 'a' or 's' command. (Only two that can do this.)
|
|
// TODO: using 256 to indicate 'a' means our s/// delimiter can't be
|
|
// a unicode character.
|
|
if (command->hit < 256) goto resume_s;
|
|
else goto resume_a;
|
|
}
|
|
|
|
// Loop through commands in this line.
|
|
|
|
command = 0;
|
|
for (;;) {
|
|
if (command) dlist_add_nomalloc(&TT.pattern, (void *)command);
|
|
|
|
// If there's no more data on this line, return.
|
|
for (;;) {
|
|
while (isspace(*line) || *line == ';') line++;
|
|
if (*line == '#') while (*line && *line != '\n') line++;
|
|
else break;
|
|
}
|
|
if (!*line) return;
|
|
|
|
// Start by writing data into toybuf.
|
|
|
|
errstart = line;
|
|
memset(toybuf, 0, sizeof(struct sedcmd));
|
|
command = (void *)toybuf;
|
|
reg = toybuf + sizeof(struct sedcmd);
|
|
|
|
// Parse address range (if any)
|
|
for (i = 0; i < 2; i++) {
|
|
if (*line == ',') line++;
|
|
else if (i) break;
|
|
|
|
if (i && *line == '+' && isdigit(line[1])) {
|
|
line++;
|
|
command->lmatch[i] = -2-strtol(line, &line, 0);
|
|
} else if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0);
|
|
else if (*line == '$') {
|
|
command->lmatch[i] = -1;
|
|
line++;
|
|
} else if (*line == '/' || *line == '\\') {
|
|
char *s = line;
|
|
|
|
if (!(s = unescape_delimited_string(&line, 0))) goto error;
|
|
if (!*s) command->rmatch[i] = 0;
|
|
else {
|
|
xregcomp((void *)reg, s, REG_EXTENDED*!!FLAG(r));
|
|
command->rmatch[i] = reg-toybuf;
|
|
reg += sizeof(regex_t);
|
|
}
|
|
free(s);
|
|
} else break;
|
|
}
|
|
|
|
while (isspace(*line)) line++;
|
|
if (!*line) break;
|
|
|
|
if (*line == '!') {
|
|
command->not = 1;
|
|
line++;
|
|
}
|
|
while (isspace(*line)) line++;
|
|
if (!*line) break;
|
|
|
|
c = command->c = *(line++);
|
|
if (strchr("}:", c) && i) break;
|
|
if (strchr("aiqQr=", c) && i>1) break;
|
|
|
|
// Allocate memory and copy out of toybuf now that we know how big it is
|
|
command = xmemdup(toybuf, reg-toybuf);
|
|
reg = (reg-toybuf) + (char *)command;
|
|
|
|
// Parse arguments by command type
|
|
if (c == '{') TT.nextlen++;
|
|
else if (c == '}') {
|
|
if (!TT.nextlen--) break;
|
|
} else if (c == 's') {
|
|
char *end, delim = 0;
|
|
int flags;
|
|
|
|
// s/pattern/replacement/flags
|
|
|
|
// line continuations use arg1 (back at the start of the function),
|
|
// so let's fill out arg2 first (since the regex part can't be multiple
|
|
// lines) and swap them back later.
|
|
|
|
// get pattern (just record, we parse it later)
|
|
command->arg2 = reg - (char *)command;
|
|
if (!(TT.remember = unescape_delimited_string(&line, &delim)))
|
|
goto error;
|
|
|
|
reg += sizeof(regex_t);
|
|
command->arg1 = reg-(char *)command;
|
|
command->hit = delim;
|
|
resume_s:
|
|
// get replacement - don't replace escapes yet because \1 and \& need
|
|
// processing later, after we replace \\ with \ we can't tell \\1 from \1
|
|
end = line;
|
|
while (*end != command->hit) {
|
|
if (!*end) goto error;
|
|
if (*end++ == '\\') {
|
|
if (!*end || *end == '\n') {
|
|
end[-1] = '\n';
|
|
break;
|
|
}
|
|
end++;
|
|
}
|
|
}
|
|
|
|
reg = extend_string((void *)&command, line, reg-(char *)command,end-line);
|
|
line = end;
|
|
// line continuation? (note: '\n' can't be a valid delim).
|
|
if (*line == command->hit) command->hit = 0;
|
|
else {
|
|
if (!*line) continue;
|
|
reg--;
|
|
line++;
|
|
goto resume_s;
|
|
}
|
|
|
|
// swap arg1/arg2 so they're back in order arguments occur.
|
|
i = command->arg1;
|
|
command->arg1 = command->arg2;
|
|
command->arg2 = i;
|
|
|
|
// get flags
|
|
for (line++; *line; line++) {
|
|
long l;
|
|
|
|
if (isspace(*line) && *line != '\n') continue;
|
|
|
|
if (0 <= (l = stridx("igpx", *line))) command->sflags |= 1<<l;
|
|
else if (*line == 'I') command->sflags |= 1<<0;
|
|
else if (!(command->sflags>>4) && 0<(l = strtol(line, &line, 10))) {
|
|
command->sflags |= l << 4;
|
|
line--;
|
|
} else break;
|
|
}
|
|
flags = (FLAG(r) || (command->sflags&8)) ? REG_EXTENDED : 0;
|
|
if (command->sflags&1) flags |= REG_ICASE;
|
|
|
|
// We deferred actually parsing the regex until we had the s///i flag
|
|
// allocating the space was done by extend_string() above
|
|
if (!*TT.remember) command->arg1 = 0;
|
|
else xregcomp((void *)(command->arg1+(char *)command),TT.remember,flags);
|
|
free(TT.remember);
|
|
TT.remember = 0;
|
|
if (*line == 'w') {
|
|
line++;
|
|
goto writenow;
|
|
}
|
|
} else if (c == 'w') {
|
|
int fd, delim;
|
|
char *cc;
|
|
|
|
// Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
|
|
// eol status, and to retain the filename for error messages, we'd need
|
|
// to go up to arg5 just for this. Compromise: dynamically allocate the
|
|
// filehandle and eol status.
|
|
|
|
writenow:
|
|
while (isspace(*line)) line++;
|
|
if (!*line) goto error;
|
|
for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
|
|
delim = *cc;
|
|
*cc = 0;
|
|
fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC|O_APPEND, 0644);
|
|
*cc = delim;
|
|
|
|
command->w = reg - (char *)command;
|
|
command = xrealloc(command, command->w+(cc-line)+6);
|
|
reg = command->w + (char *)command;
|
|
|
|
memcpy(reg, &fd, 4);
|
|
reg += 4;
|
|
*(reg++) = 0;
|
|
memcpy(reg, line, delim);
|
|
reg += delim;
|
|
*(reg++) = 0;
|
|
|
|
line = cc;
|
|
if (delim) line += 2;
|
|
} else if (c == 'y') {
|
|
char *s, delim = 0;
|
|
int len;
|
|
|
|
if (!(s = unescape_delimited_string(&line, &delim))) goto error;
|
|
command->arg1 = reg-(char *)command;
|
|
len = strlen(s);
|
|
reg = extend_string((void *)&command, s, reg-(char *)command, len);
|
|
free(s);
|
|
command->arg2 = reg-(char *)command;
|
|
if (!(s = unescape_delimited_string(&line, &delim))) goto error;
|
|
if (len != strlen(s)) goto error;
|
|
reg = extend_string((void *)&command, s, reg-(char*)command, len);
|
|
free(s);
|
|
} else if (strchr("abcirtTqQw:", c)) {
|
|
int end;
|
|
|
|
// trim leading spaces
|
|
while (isspace(*line) && *line != '\n') line++;
|
|
|
|
// Resume logic differs from 's' case because we don't add a newline
|
|
// unless it's after something, so we add it on return instead.
|
|
resume_a:
|
|
command->hit = 0;
|
|
|
|
// btTqQ: end with space or semicolon, aicrw continue to newline.
|
|
if (!(end = strcspn(line, strchr(":btTqQ", c) ? "}; \t\r\n\v\f" : "\n"))){
|
|
// Argument's optional for btTqQ
|
|
if (strchr("btTqQ", c)) continue;
|
|
else if (!command->arg1) break;
|
|
}
|
|
// Error checking: qQ can only have digits after them
|
|
if (c=='q' || c=='Q') {
|
|
for (i = 0; i<end && isdigit(line[i]); i++);
|
|
if (i != end) {
|
|
line += i;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Extend allocation to include new string. We use offsets instead of
|
|
// pointers so realloc() moving stuff doesn't break things. Ok to write
|
|
// \n over NUL terminator because call to extend_string() adds it back.
|
|
if (!command->arg1) command->arg1 = reg - (char*)command;
|
|
else if (*(command->arg1+(char *)command)) *(reg++) = '\n';
|
|
else if (!pline) {
|
|
command->arg1 = 0;
|
|
continue;
|
|
}
|
|
reg = extend_string((void *)&command, line, reg - (char *)command, end);
|
|
|
|
// Recopy data to remove escape sequences and handle line continuation.
|
|
if (strchr("aci", c)) {
|
|
reg -= end+1;
|
|
for (i = end; i; i--) {
|
|
if ((*reg++ = *line++)=='\\') {
|
|
|
|
// escape at end of line: resume if -e escaped literal newline,
|
|
// else request callback and resume with next line
|
|
if (!--i) {
|
|
*--reg = 0;
|
|
if (*line) {
|
|
line++;
|
|
goto resume_a;
|
|
}
|
|
command->hit = 256;
|
|
break;
|
|
}
|
|
if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
|
|
line++;
|
|
}
|
|
}
|
|
*reg = 0;
|
|
} else line += end;
|
|
|
|
// Commands that take no arguments
|
|
} else if (!strchr("{dDgGhHlnNpPx=", c)) break;
|
|
}
|
|
|
|
error:
|
|
error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
|
|
}
|
|
|
|
void sed_main(void)
|
|
{
|
|
struct arg_list *al;
|
|
char **args = toys.optargs;
|
|
|
|
if (!FLAG(z)) TT.delim = '\n';
|
|
|
|
// Lie to autoconf when it asks stupid questions, so configure regexes
|
|
// that look for "GNU sed version %f" greater than some old buggy number
|
|
// don't fail us for not matching their narrow expectations.
|
|
if (FLAG(version)) {
|
|
xprintf("This is not GNU sed version 9.0\n");
|
|
return;
|
|
}
|
|
|
|
// Handling our own --version means we handle our own --help too.
|
|
if (FLAG(help)) help_exit(0);
|
|
|
|
// Parse pattern into commands.
|
|
|
|
// If no -e or -f, first argument is the pattern.
|
|
if (!TT.e && !TT.f) {
|
|
if (!*toys.optargs) error_exit("no pattern");
|
|
(TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
|
|
}
|
|
|
|
// Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
|
|
// so handle all -e, then all -f. (At least the behavior's consistent.)
|
|
|
|
for (al = TT.e; al; al = al->next) parse_pattern(&al->arg, strlen(al->arg));
|
|
parse_pattern(0, 0);
|
|
for (al = TT.f; al; al = al->next)
|
|
do_lines(xopenro(al->arg), TT.delim, parse_pattern);
|
|
dlist_terminate(TT.pattern);
|
|
if (TT.nextlen) error_exit("no }");
|
|
|
|
TT.fdout = 1;
|
|
TT.remember = xstrdup("");
|
|
|
|
// Inflict pattern upon input files. Long version because !O_CLOEXEC
|
|
loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed_file);
|
|
|
|
// Provide EOF flush at end of cumulative input for non-i mode.
|
|
if (!FLAG(i) && !FLAG(s)) {
|
|
toys.optflags |= FLAG_s;
|
|
sed_line(0, 0);
|
|
}
|
|
|
|
// todo: need to close fd when done for TOYBOX_FREE?
|
|
}
|