summaryrefslogtreecommitdiffstats
path: root/autotab.c
blob: a841eeb42fe15e8a053fa6c5a56052f8572bf5a3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
/*
 * Autotab.
 *
 * A program to detect the tabbing style of a text file, and report
 * it as a Vim command to set up the tabstop, shiftwidth and expandtab
 * parameters.
 *
 * Copyright 2007-2016
 * Kaz Kylheku <kaz@kylheku.com>
 * Vancouver, Canada
 *
 * To use this, compile to an executable called "autotab".
 * Then in the .vimrc file, add something like this:
 *
 *   :au BufRead * execute 'set' system("autotab < " . bufname("%"))
 *
 * Or, better still, with this (all joined to one line):
 *
 *   :au BufReadPost * if bufname("%") != "" |
 *     execute 'set' system("autotab < " . bufname("%")) | endif
 */

#define AUTOTAB_VER 5
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <stddef.h>
#include <ctype.h>
#include <math.h>

/* Upper bound on the number of input lines read by snarf_lines. */
#define MAX_SNARF_LINES 5000 /* only sample so many lines from the input */

/* Smaller of two values. Each argument may be evaluated twice, so do not
   pass expressions with side effects. */
#define MIN(A, B) ((A) < (B) ? (A) : (B))

/* Program name, printed as the prefix of oops and debug messages. */
const char *name = "autotab";

/*
 * Node of a singly linked list of text lines.
 */
typedef struct line {
  struct line *next;  /* following node, or null at the end of the list */
  char *str;          /* null-terminated line text; released with free() by free_lines */
} line_t;

/*
 * Growable character buffer used to accumulate a line one character
 * at a time. A zero-initialized buffer_t is the blank (unallocated) state.
 */
typedef struct {
  char *base;   /* start of the realloc-managed storage, or null when blank */
  char *next;   /* position where the next character will be stored */
  size_t size;  /* number of bytes currently allocated at base */
} buffer_t;

/* Nonzero when debug() output is enabled; main sets it for the -d option. */
int debug_enabled;

/*
 * Report an error on stderr: the program name and a colon, followed
 * by the printf-style message built from fmt and the variadic arguments.
 * No newline is appended; the caller supplies it in fmt.
 */
static void oops(char *fmt, ...)
{
  va_list args;

  fprintf(stderr, "%s: ", name);

  va_start(args, fmt);
  vfprintf(stderr, fmt, args);
  va_end(args);
}

/*
 * Print a printf-style diagnostic to stdout, prefixed with the program
 * name. Does nothing unless debug_enabled is nonzero.
 */
static void debug(char *fmt, ...)
{
  va_list args;

  if (!debug_enabled)
    return;

  fprintf(stdout, "%s: ", name);

  va_start(args, fmt);
  vfprintf(stdout, fmt, args);
  va_end(args);
}

/*
 * Make sure the buffer has room for at least one more character.
 *
 * A blank buffer receives an initial 512 byte allocation; a full buffer
 * is doubled in size. A buffer which still has free space is left alone.
 *
 * Returns 1 on success, or 0 (after reporting via oops) if the size
 * computation wraps around or the allocation fails. On failure the
 * buffer is unchanged.
 */
static int grow_buffer(buffer_t *buf)
{
  char *old_base = buf->base;
  size_t old_size = buf->size;
  ptrdiff_t used = 0;
  size_t wanted;
  char *fresh;

  if (old_base != 0) {
    /* Still room for another character: nothing to do. */
    if (buf->next < old_base + old_size)
      return 1;
    used = buf->next - old_base;
  }

  if (old_size == 0)
    wanted = 512;
  else
    wanted = old_size * 2;

  /* Doubling wrapped around the range of size_t. */
  if (wanted < old_size) {
    oops("overflow due to excessively long line\n");
    return 0;
  }

  fresh = realloc(old_base, wanted);

  if (fresh == 0) {
    oops("out of memory\n");
    return 0;
  }

  /* The storage may have moved; re-derive the write position. */
  buf->base = fresh;
  buf->next = fresh + used;
  buf->size = wanted;
  return 1;
}

/*
 * Append the character ch to the buffer, growing it when required.
 * Returns 1 if the character was stored, 0 if the buffer could not grow.
 */
static int add_char(buffer_t *buf, int ch)
{
  if (grow_buffer(buf)) {
    *buf->next = ch;
    buf->next++;
    return 1;
  }

  return 0;
}

/*
 * Shrink the buffer's storage to exactly the number of characters
 * stored in it. A blank or empty buffer is left untouched, and so is
 * the buffer if the shrinking realloc fails.
 */
static void trim_buffer(buffer_t *buf)
{
  ptrdiff_t used;
  char *shrunk;

  if (buf->base == 0)
    return;

  used = buf->next - buf->base;

  if (used == 0)
    return;

  shrunk = realloc(buf->base, used);

  if (shrunk == 0)
    return;

  buf->base = shrunk;
  buf->next = shrunk + used;
  buf->size = used;
}

/*
 * Free the buffer's storage and return the buffer to the blank state.
 */
static void discard_buffer(buffer_t *buf)
{
  free(buf->base);

  buf->base = 0;
  buf->next = 0;
  buf->size = 0;
}

/*
 * Hand the buffer's storage over to the caller and reset the buffer
 * to the blank state. The returned pointer (null if the buffer was
 * blank) must eventually be released with free().
 */
static char *extract_buffer(buffer_t *buf)
{
  char *contents = buf->base;

  buf->base = 0;
  buf->next = 0;
  buf->size = 0;

  return contents;
}

/*
 * Number of characters stored in the buffer so far; zero for a
 * blank buffer.
 */
size_t buffer_size(buffer_t *buf)
{
  return (buf->base != 0) ? (size_t) (buf->next - buf->base) : 0;
}

/*
 * Returns 1 if the buffer has no storage allocated, otherwise 0.
 */
static int buffer_blank(buffer_t *buf)
{
  if (buf->base != 0)
    return 0;
  return 1;
}

/*
 * Read one line from stream into freshly allocated storage.
 *
 * The line terminator is not stored. A final line which lacks a newline
 * is still returned. Returns a null-terminated string which the caller
 * must free(), or a null pointer when end of input is reached with no
 * characters pending, or when memory runs out (which is reported via oops
 * by the buffer routines).
 */
static char *snarf_line(FILE *stream)
{
  buffer_t buf = { 0 };

  for (;;) {
    int ch = getc(stream);

    /* EOF must cause null pointer return,
       not a pointer to an empty string. */
    if (ch == EOF && buffer_blank(&buf))
      break;

    if (ch == EOF || ch == '\n') {
      if (!add_char(&buf, 0))
        goto oops_out;
      break;
    }

    /* A failed append must abort the line; otherwise characters would
       be dropped silently and a corrupted line returned. */
    if (!add_char(&buf, ch))
      goto oops_out;
  }

  trim_buffer(&buf);
  return extract_buffer(&buf);

oops_out:
  discard_buffer(&buf);
  return 0;
}

/*
 * Allocate a list node holding str and link it in front of lines.
 * Returns the new head of the list, or a null pointer if the node
 * cannot be allocated (in which case str and lines are untouched).
 */
static line_t *push_line(char *str, line_t *lines)
{
  line_t *node = malloc(sizeof *node);

  if (node == 0)
    return 0;

  node->str = str;
  node->next = lines;
  return node;
}

/*
 * Reverse a list of lines in place by relinking its nodes, and
 * return the new head (the former last node). An empty list yields
 * a null pointer.
 */
static line_t *nreverse_lines(line_t *lines)
{
  line_t *reversed = 0;
  line_t *node = lines;

  while (node != 0) {
    line_t *rest = node->next;

    node->next = reversed;
    reversed = node;
    node = rest;
  }

  return reversed;
}


/*
 * Read up to MAX_SNARF_LINES lines from stream and return them as a
 * list in input order. Reading stops early when snarf_line returns a
 * null pointer, or when a list node cannot be allocated; whatever was
 * collected up to that point is returned.
 */
static line_t *snarf_lines(FILE *stream)
{
  line_t *head = 0;
  int count = 0;

  while (count < MAX_SNARF_LINES) {
    char *text = snarf_line(stream);
    line_t *node;

    if (text == 0)
      break;

    node = push_line(text, head);

    if (node == 0) {
      oops("out of memory\n");
      free(text);
      break;
    }

    head = node;
    count++;
  }

  /* Lines were pushed onto the front, so the list is backwards. */
  return nreverse_lines(head);
}

/*
 * Release every node of the list together with the string it holds.
 * A null list is accepted.
 */
static void free_lines(line_t *lines)
{
  while (lines != 0) {
    line_t *doomed = lines;

    lines = lines->next;
    free(doomed->str);
    free(doomed);
  }
}

/*
 * Returns 1 if any line of the list contains pattern as a substring,
 * otherwise 0.
 */
static int fgrep(line_t *list, const char *pattern)
{
  line_t *cur;

  for (cur = list; cur != 0; cur = cur->next) {
    if (strstr(cur->str, pattern) != 0)
      return 1;
  }

  return 0;
}

/*
 * Single-character symbols that make up a munged line (see munge_char
 * and tab_munge_line). They are string literals so that they can be
 * concatenated into character sets for strspn, strcspn and strchr.
 */
#define SPACE  " "   /* whitespace or control character */
#define LETAB  "."   /* column filled by a tab in the leading indentation */
#define INTAB  "-"   /* column filled by a tab after the first non-tab character */
#define OPGRP  "("   /* opening bracket: one of ( [ { < */
#define CLGRP  ")"   /* closing bracket: one of ) ] } > */
#define TOKEN  "x"   /* alphanumeric, underscore, or negative char value */
#define PUNCT  "|"   /* any other character */

/* Character sets: every kind of blank column, and every kind of non-blank. */
#define ANYSP  SPACE LETAB INTAB
#define NONSP  OPGRP CLGRP TOKEN PUNCT

/*
 * Classify a character into one of the munge symbols:
 * TOKEN for negative values, alphanumerics and underscore;
 * SPACE for whitespace and control characters;
 * OPGRP and CLGRP for opening and closing brackets;
 * PUNCT for everything else.
 */
int munge_char(int ch)
{
  static const char openers[] = "([{<";
  static const char closers[] = ")]}>";
  int symbol;

  /* The negative test comes first so the <ctype.h> functions are
     never called with a negative value. */
  if (ch < 0 || ch == '_' || isalnum(ch))
    symbol = TOKEN[0];
  else if (isspace(ch) || iscntrl(ch))
    symbol = SPACE[0];
  else if (strchr(openers, ch) != 0)
    symbol = OPGRP[0];
  else if (strchr(closers, ch) != 0)
    symbol = CLGRP[0];
  else
    symbol = PUNCT[0];

  return symbol;
}

/*
 * Produce the munged form of one line for the given tab size.
 *
 * Each tab is expanded to the next multiple of tabsize columns: tabs
 * which occur before any other character become runs of '.' (LETAB),
 * later tabs become runs of '-' (INTAB). Every other character is
 * replaced by its munge_char symbol. Anything after the last non-tab
 * character (that is, trailing tab fill) is cut off.
 *
 * The result is the basis for the pattern matching which detects
 * alignment between lines. It is allocated with the buffer routines and
 * must be freed by the caller; a null pointer is returned if memory
 * runs out.
 */
static char *tab_munge_line(const char *str, int tabsize)
{
  buffer_t out = { 0 };
  int column = 0;
  int in_leading = 1;
  size_t keep_len = 0;
  const char *p;
  char *result;

  /* Guarantee storage exists even for an empty input line. */
  if (!grow_buffer(&out))
    goto fail;

  for (p = str; *p != 0; p++) {
    if (*p == '\t') {
      int fill = in_leading ? LETAB[0] : INTAB[0];

      /* Emit fill symbols up to the next tab stop. */
      do {
        if (!add_char(&out, fill))
          goto fail;
        column++;
      } while (column % tabsize != 0);
    } else {
      in_leading = 0;

      if (!add_char(&out, munge_char(*p)))
        goto fail;

      column++;
      keep_len = buffer_size(&out);
    }
  }

  if (!add_char(&out, 0))
    goto fail;

  trim_buffer(&out);
  result = extract_buffer(&out);
  result[keep_len] = 0;
  return result;

fail:
  discard_buffer(&out);
  return 0;
}

/*
 * Build a new list by applying func to the string of every line,
 * passing arg as func's second argument. The input list is not
 * modified. Processing stops at the first line for which func
 * returns a null pointer, or when a node cannot be allocated; the
 * lines converted up to that point are returned, in input order.
 */
static line_t *filter_lines(line_t *lines, int arg,
                            char *(*func)(const char *, int))
{
  line_t *result = 0;
  line_t *cur = lines;

  while (cur != 0) {
    char *converted = func(cur->str, arg);
    line_t *node;

    if (converted == 0)
      break;

    node = push_line(converted, result);

    if (node == 0) {
      oops("out of memory\n");
      free(converted);
      break;
    }

    result = node;
    cur = cur->next;
  }

  /* Nodes were pushed onto the front, so restore input order. */
  return nreverse_lines(result);
}

/*
 * Return a newly allocated list holding the munged form of every
 * line, computed by tab_munge_line for the given tab size.
 */
static line_t *tab_munge(line_t *lines, int tabsize)
{
  line_t *munged = filter_lines(lines, tabsize, tab_munge_line);

  return munged;
}

/*
 * Match the start of str against a sequence of character sets.
 *
 * The arguments after str are strings, terminated by a null pointer
 * (which must be passed as (const char *) 0). The first character of
 * str must occur in the first set, the second character in the second
 * set, and so on. Reaching the end of str before the sets run out is
 * a mismatch.
 *
 * Returns 1 on a match, 0 otherwise.
 */
static int smatch(const char *str, const char *bag0, ...)
{
  va_list args;
  const char *set;
  const char *cursor = str;
  int ok = 1;

  va_start(args, bag0);

  for (set = bag0; set != 0; set = va_arg(args, const char *), cursor++) {
    if (*cursor == 0 || strchr(set, *cursor) == 0) {
      ok = 0;
      break;
    }
  }

  va_end(args);

  return ok;
}

/*
 * Compute a heuristic score which estimates how well the text lines up
 * under a particular interpretation of tabs.
 *
 * lines is a list of munged lines (strings made of the SPACE, LETAB,
 * INTAB, OPGRP, CLGRP, TOKEN and PUNCT symbols). The function walks
 * over pairs of nearby lines and adds points whenever the second line
 * appears to be aligned with something in the first line. shiftwidth
 * is used to tell ordinary nesting indentation apart from alignment;
 * it is used as a divisor, so it must be nonzero. The tabsize parameter
 * is not referenced in the body.
 *
 * Returns the accumulated score; larger means more evidence of alignment.
 */
static long compute_alignment_score(line_t *lines, int tabsize, int shiftwidth)
{
  line_t *next;
  long lineno0, lineno1;
  long score = 0;

/* Debug trace of a scoring event: the source location which awarded the
   score, the pair of line numbers, both munged lines, and a caret under
   the position STR0_PTR within the first line. */
#define ALIGN_DBG(STR0_PTR) \
  do { \
    debug("code %s:%d\n", __FILE__, __LINE__); \
    debug("lines: %ld,%ld\n", lineno0, lineno1); \
    debug("line0: %s\n", lines->str); \
    debug("line1: %s\n", next->str); \
    debug("pos:   %*s\n", (int) ((STR0_PTR) - lines->str) + 1, "^"); \
  } while (0)

  /* Loop over pairs of lines. After each iteration, the
     second one of the pair becomes the first line in the next pair.
     The pair of lines are not always consecutive. */
  for (next = lines ? lines->next : 0, lineno0 = 1, lineno1 = lineno0 + 1;
       lines && next;
       lines = next, next = next ? next->next : 0, lineno0 = lineno1++)
  {
    char *str0 = lines->str;
    size_t len0 = strlen(str0);
    long tnd0 = strspn(str0, LETAB);    /* leading space generated by tabs */
    long ind0 = strspn(str0, ANYSP);    /* indentation */
    long int0 = strcspn(str0, INTAB);   /* position of first inner tab */

    if (len0 == ind0) {
      /* First of the two lines is blank, or pure indentation. Next! */
      continue;
    }

    /* This inner loop just bails if it reaches the end of its body,
       but if it is continued, it scans to choose a different second
       line while maintaining the same first line, but not too far!
       The reason for this is that alignment sometimes occurs between
       lines which are not consecutive. There can be blank lines,
       or "out of band" things like preprocessor directives in the
       C language. If we can't find a viable next line within a
       few lines, we skip the whole junk, and continue with a new pair. */
    for (; next && lineno1 - lineno0 <= 4; next = next->next, lineno1++)
    {
      char *str1 = next->str;
      size_t len1 = strlen(str1);
      long tnd1 = strspn(str1, LETAB);
      long ind1 = strspn(str1, ANYSP);
      long int1 = strcspn(str1, INTAB);

      if (len1 == ind1) {
        /* Second line is blank or pure indentation; choose
           the next line to be used as second. */
        continue;
      }

      /* There is usually no alignment when the next line de-indents. */
      if (ind1 < ind0) {
        if (ind1 == 0 || ind0 - ind1 > shiftwidth) {
          /* If the de-indent is by more than a shiftwidth, or goes
             all the way to column zero, choose another second line. */
          continue;
        }
        /* Else skip to next pair. */
        break;
      }

      /* If the second line is indented more, but has less tabbed
         indentation, that probably indicates too small a tab size, or
         odd formatting. Better not look for alignment with this pair. */
      if (tnd1 < tnd0)
        break;

      /* Second line indents beyond first line. There are a couple
         of interesting cases here, when the first line opens
         an indented group. */
      if (ind1 >= len0) {
        if (len0 && strchr(OPGRP, str0[len0-1]) &&
            ind1 != tnd1 && ind0 != tnd0) {
          /* First line ends with opening parenthesis, bracket or brace. */
          if (ind1 == len0 && ind0 < len0 - 1) {
            score += 2;
            ALIGN_DBG(str0 + ind1);
          }
          if (ind1 == len0 - 1 + shiftwidth) {
            score += 4;
            ALIGN_DBG(str0 + ind1);
          }
        }
        break;
      }

      if (tnd0 != tnd1) {
        /* fsp0: position of the first blank after the first item which
           follows the indentation; fit0: position of the item after
           that blank run. */
        long fsp0 = ind0 + strcspn(str0 + ind0, ANYSP);
        long fit0 = fsp0 + strspn(str0 + fsp0, ANYSP);

        /* If lines differ in leading tabs, then we do leading align check. */
        if (ind0 % shiftwidth == 0 && ind1 == ind0 + shiftwidth) {
          /* An indentation from a shiftwidth-aligned position by
             one shiftwidth contributes a small score. */
          /* ALIGN_DBG(str0 + ind0); */
          score += 1;
        }
        if (ind0 == ind1) {
          /* Small score if lines are indented the same, but
             with different combination of tabs and spaces;
             this sometimes occurs when code is edited by different
             people who agree on the indentation, but disagree
             on the use of hard tabs versus spaces. */
          ALIGN_DBG(str0 + ind1);
          score += 2;
        }
        if (ind0 != ind1 && ind1 != tnd1 &&
            !(ind1 - ind0 <= shiftwidth && ind1 % shiftwidth == 0)) {
          /* If indent of second line is different from first,
             does not consist of all tabs, and is not a multiple of the
             shiftwidth that is within a shiftwidth of the prior line,
             it may be a leading alignment. */
          if (len0 && strchr(OPGRP, str0[len0 - 1])) {
            /* Any indentation in the line following a line which
               ends with group opener is probably just indentation,
               with no alignment. */
            break;
          }
          if (ind1 == fit0) {
            /* Indented line aligned with space-delimited item,
               which is the first such item past the indentation.
               This is weak if it occurs within a shiftwidth. */
            ALIGN_DBG(str0 + fit0);
            score += (ind1 - ind0) > shiftwidth ? 6 : 2;
          }
          if (strchr(OPGRP, str0[ind1 - 1]) && (strchr(NONSP, str0[ind1]))) {
            /* Indented line aligned with nonspace following opening
               bracket or brace. */
            ALIGN_DBG(str0 + ind1);
            score += (ind1 - ind0) > shiftwidth ? 6 : 2;
          }
        }
        /* Now look for inner alignments due to offsets caused by
           leading tabs, or inner tabs. */
        str0 += ind1;
        str1 += ind1;
      } else if (int0 == len0 && int1 == len1) {
        /* If lines do not differ in leading tabs, and have no internal
           tabs, we are done. */
        break;
      } else {
        /* Lines do not differ in leading tabs, but have internal tabs;
           so we can advance to the first discrepancy in inner tabbing,
           and look for alignments after that. */
        str0 += ind1;
        str1 += ind1;

        for (;;) {
          {
            /* Skip the stretch before the next inner tab in each line;
               stop if the stretches differ in length. */
            size_t itcsp0 = strcspn(str0, INTAB);
            size_t itcsp1 = strcspn(str1, INTAB);

            str0 += MIN(itcsp0, itcsp1);
            str1 += MIN(itcsp0, itcsp1);

            if (itcsp0 != itcsp1)
              break;
          }

          {
            /* Skip the inner tab fill itself; stop if the runs differ
               in length, or there is no inner tab at this position. */
            size_t itsp0 = strspn(str0, INTAB);
            size_t itsp1 = strspn(str1, INTAB);

            str0 += MIN(itsp0, itsp1);
            str1 += MIN(itsp0, itsp1);

            if (itsp0 != itsp1 || itsp0 == 0)
              break;
          }
          /* Both lines have identical inner tabbing up to this point. */
          score += 2;
          ALIGN_DBG(str0);
        }
      }

      /* Scan the remainder of both lines in lock step, and award a score
         for the first position at which both show the same kind of
         alignment feature. */
      for (; *str0 && *str1; str0++, str1++) {
        if (str0[0] == INTAB[0] && strchr(NONSP, str0[1])) {
          /* Alignment of non-space elements by inner tabs is strong indicator. */
          if (str1[0] == INTAB[0] && strchr(NONSP, str1[1])) {
            ALIGN_DBG(str0);
            score += 4;
            break;
          }
        } else if (!strncmp(str0, INTAB SPACE, 2)) {
          /* Alignment of spaces which follow an inner tab is a weak indicator. */
          if (!strncmp(str1, INTAB SPACE, 2)) {
            ALIGN_DBG(str0);
            score += 2;
            break;
          }
        } else if (smatch(str0, SPACE, SPACE, NONSP, (const char *) 0)) {
          /* Alignment of material preceded by two or more blanks
            (achieved by either tabs or spaces). */
          if (smatch(str1, SPACE INTAB, SPACE INTAB, NONSP, (const char *) 0)) {
            ALIGN_DBG(str0);
            score += 4;
            break;
          }
        }
      }

      break;
    }
  }

  return score;
#undef ALIGN_DBG
}

/*
 * Estimate the shift width (indentation step) of the text.
 *
 * lines_in is either a list of raw lines (munged == 0), which is then
 * munged here using tabsize, or an already munged list (munged != 0),
 * which is used directly and remains owned by the caller.
 *
 * The function tabulates how often the indentation changes by 2 to 8
 * columns between consecutive non-empty lines, and picks the most
 * frequent amount. On a tie the larger amount wins.
 *
 * Returns the shift width in the range 2 to 8, or 0 if there are no
 * lines to examine (empty input, or munging failed).
 */
static int determine_shiftwidth(line_t *lines_in, int tabsize, int munged)
{
  /* Keep the head of the list: the scan below advances a separate
     cursor, and the head is needed to free a privately munged list. */
  line_t *head = (munged) ? lines_in : tab_munge(lines_in, tabsize);
  line_t *lines;
  long max_hist = 0, indent_hist[9] = { 0 }, zerocol_hist[9] = { 0 };
  int i, shiftwidth = 0;

  if (head == 0)
    return 0;

  for (lines = head; lines && lines->next; lines = lines->next) {
    char *str0 = lines->str;
    char *str1 = lines->next->str;
    long ind0 = strspn(str0, ANYSP);
    long ind1 = strspn(str1, ANYSP);
    long move = labs(ind1 - ind0);

    /* Empty lines tell us nothing. */
    if (strlen(str0 + ind0) == 0 || strlen(str1 + ind1) == 0)
      continue;

    /* Only a move by an amount which is a divisor of the starting
       position is nesting-level indentation.
       This criterion filters out a lot of indentation which
       exists for alignment with something in the prior line,
       rather than for increased block nesting */
    if (move >= 2 && move <= 8) {
      if (ind0 && ind1) {
        if (ind0 % move == 0)
          indent_hist[move]++;
      } else if (ind0 < ind1) {
        /* Indentation out of column zero tabulated separately.
           Consider only if second line goes beyond previous line,
           or if the indentation does not suspiciously look like alignment. */
        if (strlen(str0) < (size_t) ind1 ||
            !(strchr(ANYSP, str0[ind1 - 1]) != 0 &&
              strchr(ANYSP, str0[ind1]) == 0))
          zerocol_hist[move]++;
      }
    }
  }

  for (i = 2; i <= 8; i++) {
    debug("hist[%d] = %ld\n", i, indent_hist[i]);
    if (indent_hist[i] >= max_hist) {
      max_hist = indent_hist[i];
      shiftwidth = i;
    }
  }

  /* The indent_history turned up nothing; maybe the file
     only has one level of indentation, with no deeper nesting. */
  if (max_hist == 0) {
    for (i = 2; i <= 8; i++) {
      debug("zerocol[%d] = %ld\n", i, zerocol_hist[i]);
      if (zerocol_hist[i] >= max_hist) {
        max_hist = zerocol_hist[i];
        shiftwidth = i;
      }
    }
  }

  /* Release the whole privately munged list, not just its tail. */
  if (!munged)
    free_lines(head);

  return shiftwidth;
}

/*
 * Choose the tab size, from 2 to 8, under which the text shows the
 * most alignment.
 *
 * For every candidate size the lines are munged, a shift width is
 * estimated and an alignment score is computed. The common sizes 4
 * and 8 receive a bonus. On a tie the larger size wins.
 *
 * Returns the winning tab size, or 0 if the lines cannot be munged.
 */
static int determine_tabsize(line_t *lines)
{
  long top_score = -1;
  int winner = 0;
  int candidate;

  for (candidate = 2; candidate <= 8; candidate++) {
    line_t *munged = tab_munge(lines, candidate);
    int width;
    long score;

    if (munged == 0)
      return 0;

    width = determine_shiftwidth(munged, candidate, 1);
    score = compute_alignment_score(munged, candidate, width);

    /* Scores for common tab sizes get a boost; 8 a bit more than 4. */
    if (candidate == 4)
      score += 1 + score / 16;
    else if (candidate == 8)
      score += 1 + score / 8;

    if (score >= top_score) {
      top_score = score;
      winner = candidate;
    }

    free_lines(munged);
    debug("score[%d] = %ld\n", candidate, score);
  }

  return winner;
}

/*
 * Decide whether expandtab should be turned on.
 *
 * lines_in is a list of raw lines; it is munged here using tabsize and
 * is not modified. shiftwidth is used as a divisor and must be nonzero.
 *
 * Returns 1 (use expandtab) if a quarter or fewer of the lines whose
 * indentation is deep enough to hold a tab actually contain a tab in
 * that indentation, otherwise 0.
 */
int determine_expandtab(line_t *lines_in, int tabsize, int shiftwidth)
{
  /* Keep the head of the munged list so that all of it can be freed
     after the scanning loop has run its cursor off the end. */
  line_t *head = tab_munge(lines_in, tabsize);
  line_t *lines;
  int indented = 0, tabbed = 0;

  for (lines = head; lines; lines = lines->next) {
    char *str = lines->str;
    long ind = strspn(str, ANYSP);

    /* Count indented lines which require at least one tab,
     * and count how many of these include a tab in that
     * indentation.
     */
    if (ind % shiftwidth == 0 && ind >= tabsize) {
      char *tab = strpbrk(str, INTAB LETAB);

      indented++;
      if (!tab)
        continue;
      /* The first tab lies beyond the indentation: not counted. */
      if (tab - str > ind)
        continue;
      tabbed++;
    }
  }

  free_lines(head);

  /* If 25% or fewer of the indented lines which should
   * have tabs actually have tabs, then let's turn
   * on expandtab mode.
   */
  return (tabbed <= indented / 4) ? 1 : 0;
}

/*
 * Entry point. Reads text from standard input and prints the detected
 * settings in the form "tabstop=N shiftwidth=N [no]expandtab".
 *
 * Options: -d enables debug output; --version prints the version and
 * exits. Any other argument is rejected.
 *
 * Empty input produces the defaults (tabstop 8, shiftwidth 8,
 * expandtab). If the analysis fails, nothing is printed and the exit
 * status is EXIT_FAILURE.
 */
int main(int argc, char **argv)
{
  int tabsize = 8, shiftwidth = 8, expandtabs = 1;
  int status = EXIT_FAILURE;
  int report = 1;   /* cleared when the analysis fails */
  line_t *lines;

  if (argc > 1) {
    const char *opt = argv[1];

    if (strcmp(opt, "-d") == 0) {
      debug_enabled = 1;
    } else if (strcmp(opt, "--version") == 0) {
      printf("Autotab %d\n", AUTOTAB_VER);
      return EXIT_SUCCESS;
    } else {
      fputs("invalid argument\n", stderr);
      return EXIT_FAILURE;
    }
  }

  lines = snarf_lines(stdin);

  /* With no input at all, the defaults are reported as they are. */
  if (lines != 0) {
    /* Tab size only needs to be worked out if the text contains tabs. */
    if (fgrep(lines, "\t")) {
      expandtabs = 0;
      tabsize = determine_tabsize(lines);
      if (tabsize == 0)
        report = 0;
    }

    if (report) {
      shiftwidth = determine_shiftwidth(lines, tabsize, 0);
      if (shiftwidth == 0)
        report = 0;
    }

    if (report && !expandtabs)
      expandtabs = determine_expandtab(lines, tabsize, shiftwidth);
  }

  if (report) {
    printf("tabstop=%d shiftwidth=%d %sexpandtab\n", tabsize, shiftwidth,
           expandtabs ? "" : "no");
    status = 0;
  }

  free_lines(lines);
  return status;
}