Logo Search packages:      
Sourcecode: filezilla version File versions

wildcard.c

/*
 * Wildcard matching engine for use with SFTP-based file transfer
 * programs (PSFTP, new-look PSCP): since SFTP has no notion of
 * getting the remote side to do globbing (and rightly so) we have
 * to do it locally, by retrieving all the filenames in a directory
 * and checking each against the wildcard pattern.
 */

#include <assert.h>
#include <stdlib.h>
#include <string.h>

#include "putty.h"

/*
 * Definition of wildcard syntax:
 * 
 *  - * matches any sequence of characters, including zero.
 *  - ? matches exactly one character which can be anything.
 *  - [abc] matches exactly one character which is a, b or c.
 *  - [a-f] matches anything from a through f.
 *  - [^a-f] matches anything _except_ a through f.
 *  - [-_] matches - or _; [^-_] matches anything else. (The - is
 *    non-special if it occurs immediately after the opening
 *    bracket or ^.)
 *  - [a^] matches an a or a ^. (The ^ is non-special if it does
 *    _not_ occur immediately after the opening bracket.)
 *  - \*, \?, \[, \], \\ match the single characters *, ?, [, ], \.
 *  - All other characters are non-special and match themselves.
 */

/*
 * Some notes on differences from POSIX globs (IEEE Std 1003.1, 2003 ed.):
 *  - backslashes act as escapes even within [] bracket expressions
 *  - does not support [!...] for non-matching list (POSIX are weird);
 *    NB POSIX allows [^...] as well via "A bracket expression starting
 *    with an unquoted circumflex character produces unspecified
 *    results". If we wanted to allow [!...] we might want to define
 *    [^!] as having its literal meaning (match '^' or '!').
 *  - none of the scary [[:class:]] stuff, etc
 */

/*
 * The wildcard matching technique we use is very simple and
 * potentially O(N^2) in running time, but I don't anticipate it
 * being that bad in reality (particularly since N will be the size
 * of a filename, which isn't all that much). Perhaps one day, once
 * PuTTY has grown a regexp matcher for some other reason, I might
 * come back and reimplement wildcards by translating them into
 * regexps or directly into NFAs; but for the moment, in the
 * absence of any other need for the NFA->DFA translation engine,
 * anything more than the simplest possible wildcard matcher is
 * vast code-size overkill.
 * 
 * Essentially, these wildcards are much simpler than regexps in
 * that they consist of a sequence of rigid fragments (? and [...]
 * can never match more or less than one character) separated by
 * asterisks. It is therefore extremely simple to look at a rigid
 * fragment and determine whether or not it begins at a particular
 * point in the test string; so we can search along the string
 * until we find each fragment, then search for the next. As long
 * as we find each fragment in the _first_ place it occurs, there
 * will never be a danger of having to backpedal and try to find it
 * again somewhere else.
 */

enum {
    WC_TRAILINGBACKSLASH = 1,
    WC_UNCLOSEDCLASS,
    WC_INVALIDRANGE
};

/*
 * Error reporting is done by returning various negative values
 * from the wildcard routines. Passing any such value to wc_error
 * will give a human-readable message.
 */
const char *wc_error(int value)
{
    value = abs(value);
    switch (value) {
      case WC_TRAILINGBACKSLASH:
      return "'\' occurred at end of string (expected another character)";
      case WC_UNCLOSEDCLASS:
      return "expected ']' to close character class";
      case WC_INVALIDRANGE:
      return "character range was not terminated (']' just after '-')";
    }
    return "INTERNAL ERROR: unrecognised wildcard error number";
}

/*
 * This is the routine that tests a target string to see if an
 * initial substring of it matches a fragment. If successful, it
 * returns 1, and advances both `fragment' and `target' past the
 * fragment and matching substring respectively. If unsuccessful it
 * returns zero. If the wildcard fragment suffers a syntax error,
 * it returns <0 and the precise value indexes into wc_error.
 */
static int wc_match_fragment(const char **fragment, const char **target)
{
    const char *f, *t;

    f = *fragment;
    t = *target;
    /*
     * The fragment terminates at either the end of the string, or
     * the first (unescaped) *.
     */
    while (*f && *f != '*' && *t) {
      /*
       * Extract one character from t, and one character's worth
       * of pattern from f, and step along both. Return 0 if they
       * fail to match.
       */
      if (*f == '\\') {
          /*
           * Backslash, which means f[1] is to be treated as a
           * literal character no matter what it is. It may not
           * be the end of the string.
           */
          if (!f[1])
            return -WC_TRAILINGBACKSLASH;   /* error */
          if (f[1] != *t)
            return 0;          /* failed to match */
          f += 2;
      } else if (*f == '?') {
          /*
           * Question mark matches anything.
           */
          f++;
      } else if (*f == '[') {
          int invert = 0;
          int matched = 0;
          /*
           * Open bracket introduces a character class.
           */
          f++;
          if (*f == '^') {
            invert = 1;
            f++;
          }
          while (*f != ']') {
            if (*f == '\\')
                f++;           /* backslashes still work */
            if (!*f)
                return -WC_UNCLOSEDCLASS;   /* error again */
            if (f[1] == '-') {
                int lower, upper, ourchr;
                lower = (unsigned char) *f++;
                f++;           /* eat the minus */
                if (*f == ']')
                  return -WC_INVALIDRANGE;   /* different error! */
                if (*f == '\\')
                  f++;         /* backslashes _still_ work */
                if (!*f)
                  return -WC_UNCLOSEDCLASS;   /* error again */
                upper = (unsigned char) *f++;
                ourchr = (unsigned char) *t;
                if (lower > upper) {
                  int t = lower; lower = upper; upper = t;
                }
                if (ourchr >= lower && ourchr <= upper)
                  matched = 1;
            } else {
                matched |= (*t == *f++);
            }
          }
          if (invert == matched)
            return 0;          /* failed to match character class */
          f++;                 /* eat the ] */
      } else {
          /*
           * Non-special character matches itself.
           */
          if (*f != *t)
            return 0;
          f++;
      }
      /*
       * Now we've done that, increment t past the character we
       * matched.
       */
      t++;
    }
    if (!*f || *f == '*') {
      /*
       * We have reached the end of f without finding a mismatch;
       * so we're done. Update the caller pointers and return 1.
       */
      *fragment = f;
      *target = t;
      return 1;
    }
    /*
     * Otherwise, we must have reached the end of t before we
     * reached the end of f; so we've failed. Return 0. 
     */
    return 0;
}

/*
 * This is the real wildcard matching routine. It returns 1 for a
 * successful match, 0 for an unsuccessful match, and <0 for a
 * syntax error in the wildcard.
 */
int wc_match(const char *wildcard, const char *target)
{
    int ret;

    /*
     * Every time we see a '*' _followed_ by a fragment, we just
     * search along the string for a location at which the fragment
     * matches. The only special case is when we see a fragment
     * right at the start, in which case we just call the matching
     * routine once and give up if it fails.
     */
    if (*wildcard != '*') {
      ret = wc_match_fragment(&wildcard, &target);
      if (ret <= 0)
          return ret;                /* pass back failure or error alike */
    }

    while (*wildcard) {
      assert(*wildcard == '*');
      while (*wildcard == '*')
          wildcard++;

      /*
       * It's possible we've just hit the end of the wildcard
       * after seeing a *, in which case there's no need to
       * bother searching any more because we've won.
       */
      if (!*wildcard)
          return 1;

      /*
       * Now `wildcard' points at the next fragment. So we
       * attempt to match it against `target', and if that fails
       * we increment `target' and try again, and so on. When we
       * find we're about to try matching against the empty
       * string, we give up and return 0.
       */
      ret = 0;
      while (*target) {
          const char *save_w = wildcard, *save_t = target;

          ret = wc_match_fragment(&wildcard, &target);

          if (ret < 0)
            return ret;        /* syntax error */

          if (ret > 0 && !*wildcard && *target) {
            /*
             * Final special case - literally.
             * 
             * This situation arises when we are matching a
             * _terminal_ fragment of the wildcard (that is,
             * there is nothing after it, e.g. "*a"), and it
             * has matched _too early_. For example, matching
             * "*a" against "parka" will match the "a" fragment
             * against the _first_ a, and then (if it weren't
             * for this special case) matching would fail
             * because we're at the end of the wildcard but not
             * at the end of the target string.
             * 
             * In this case what we must do is measure the
             * length of the fragment in the target (which is
             * why we saved `target'), jump straight to that
             * distance from the end of the string using
             * strlen, and match the same fragment again there
             * (which is why we saved `wildcard'). Then we
             * return whatever that operation returns.
             */
            target = save_t + strlen(save_t) - (target - save_t);
            wildcard = save_w;
            return wc_match_fragment(&wildcard, &target);
          }

          if (ret > 0)
            break;
          target++;
      }
      if (ret > 0)
          continue;
      return 0;
    }

    /*
     * If we reach here, it must be because we successfully matched
     * a fragment and then found ourselves right at the end of the
     * wildcard. Hence, we return 1 if and only if we are also
     * right at the end of the target.
     */
    return (*target ? 0 : 1);
}

/*
 * Another utility routine that translates a non-wildcard string
 * into its raw equivalent by removing any escaping backslashes.
 * Expects a target string buffer of anything up to the length of
 * the original wildcard. You can also pass NULL as the output
 * buffer if you're only interested in the return value.
 * 
 * Returns 1 on success, or 0 if a wildcard character was
 * encountered. In the latter case the output string MAY not be
 * zero-terminated and you should not use it for anything!
 */
int wc_unescape(char *output, const char *wildcard)
{
    while (*wildcard) {
      if (*wildcard == '\\') {
          wildcard++;
          /* We are lenient about trailing backslashes in non-wildcards. */
          if (*wildcard) {
            if (output)
                *output++ = *wildcard;
            wildcard++;
          }
      } else if (*wildcard == '*' || *wildcard == '?' ||
               *wildcard == '[' || *wildcard == ']') {
          return 0;                  /* it's a wildcard! */
      } else {
          if (output)
            *output++ = *wildcard;
          wildcard++;
      }
    }
    *output = '\0';
    return 1;                        /* it's clean */
}

#ifdef TESTMODE

struct test {
    const char *wildcard;
    const char *target;
    int expected_result;
};

const struct test fragment_tests[] = {
    /*
     * We exhaustively unit-test the fragment matching routine
     * itself, which should save us the need to test all its
     * intricacies during the full wildcard tests.
     */
    {"abc", "abc", 1},
    {"abc", "abd", 0},
    {"abc", "abcd", 1},
    {"abcd", "abc", 0},
    {"ab[cd]", "abc", 1},
    {"ab[cd]", "abd", 1},
    {"ab[cd]", "abe", 0},
    {"ab[^cd]", "abc", 0},
    {"ab[^cd]", "abd", 0},
    {"ab[^cd]", "abe", 1},
    {"ab\\", "abc", -WC_TRAILINGBACKSLASH},
    {"ab\\*", "ab*", 1},
    {"ab\\?", "ab*", 0},
    {"ab?", "abc", 1},
    {"ab?", "ab", 0},
    {"ab[", "abc", -WC_UNCLOSEDCLASS},
    {"ab[c-", "abb", -WC_UNCLOSEDCLASS},
    {"ab[c-]", "abb", -WC_INVALIDRANGE},
    {"ab[c-e]", "abb", 0},
    {"ab[c-e]", "abc", 1},
    {"ab[c-e]", "abd", 1},
    {"ab[c-e]", "abe", 1},
    {"ab[c-e]", "abf", 0},
    {"ab[e-c]", "abb", 0},
    {"ab[e-c]", "abc", 1},
    {"ab[e-c]", "abd", 1},
    {"ab[e-c]", "abe", 1},
    {"ab[e-c]", "abf", 0},
    {"ab[^c-e]", "abb", 1},
    {"ab[^c-e]", "abc", 0},
    {"ab[^c-e]", "abd", 0},
    {"ab[^c-e]", "abe", 0},
    {"ab[^c-e]", "abf", 1},
    {"ab[^e-c]", "abb", 1},
    {"ab[^e-c]", "abc", 0},
    {"ab[^e-c]", "abd", 0},
    {"ab[^e-c]", "abe", 0},
    {"ab[^e-c]", "abf", 1},
    {"ab[a^]", "aba", 1},
    {"ab[a^]", "ab^", 1},
    {"ab[a^]", "abb", 0},
    {"ab[^a^]", "aba", 0},
    {"ab[^a^]", "ab^", 0},
    {"ab[^a^]", "abb", 1},
    {"ab[-c]", "ab-", 1},
    {"ab[-c]", "abc", 1},
    {"ab[-c]", "abd", 0},
    {"ab[^-c]", "ab-", 0},
    {"ab[^-c]", "abc", 0},
    {"ab[^-c]", "abd", 1},
    {"ab[\\[-\\]]", "abZ", 0},
    {"ab[\\[-\\]]", "ab[", 1},
    {"ab[\\[-\\]]", "ab\\", 1},
    {"ab[\\[-\\]]", "ab]", 1},
    {"ab[\\[-\\]]", "ab^", 0},
    {"ab[^\\[-\\]]", "abZ", 1},
    {"ab[^\\[-\\]]", "ab[", 0},
    {"ab[^\\[-\\]]", "ab\\", 0},
    {"ab[^\\[-\\]]", "ab]", 0},
    {"ab[^\\[-\\]]", "ab^", 1},
    {"ab[a-fA-F]", "aba", 1},
    {"ab[a-fA-F]", "abF", 1},
    {"ab[a-fA-F]", "abZ", 0},
};

const struct test full_tests[] = {
    {"a", "argh", 0},
    {"a", "ba", 0},
    {"a", "a", 1},
    {"a*", "aardvark", 1},
    {"a*", "badger", 0},
    {"*a", "park", 0},
    {"*a", "pArka", 1},
    {"*a", "parka", 1},
    {"*a*", "park", 1},
    {"*a*", "perk", 0},
    {"?b*r?", "abracadabra", 1},
    {"?b*r?", "abracadabr", 0},
    {"?b*r?", "abracadabzr", 0},
};

int main(void)
{
    int i;
    int fails, passes;

    fails = passes = 0;

    for (i = 0; i < sizeof(fragment_tests)/sizeof(*fragment_tests); i++) {
      const char *f, *t;
      int eret, aret;
      f = fragment_tests[i].wildcard;
      t = fragment_tests[i].target;
      eret = fragment_tests[i].expected_result;
      aret = wc_match_fragment(&f, &t);
      if (aret != eret) {
          printf("failed test: /%s/ against /%s/ returned %d not %d\n",
               fragment_tests[i].wildcard, fragment_tests[i].target,
               aret, eret);
          fails++;
      } else
          passes++;
    }

    for (i = 0; i < sizeof(full_tests)/sizeof(*full_tests); i++) {
      const char *f, *t;
      int eret, aret;
      f = full_tests[i].wildcard;
      t = full_tests[i].target;
      eret = full_tests[i].expected_result;
      aret = wc_match(f, t);
      if (aret != eret) {
          printf("failed test: /%s/ against /%s/ returned %d not %d\n",
               full_tests[i].wildcard, full_tests[i].target,
               aret, eret);
          fails++;
      } else
          passes++;
    }

    printf("passed %d, failed %d\n", passes, fails);

    return 0;
}

#endif

Generated by  Doxygen 1.6.0   Back to index