Whamcloud - gitweb
EX-6792 tests: Added support of UTF-8 in lipe_find3
authorAlexandre Ioffe <aioffe@ddn.com>
Mon, 20 Feb 2023 04:50:07 +0000 (20:50 -0800)
committerAndreas Dilger <adilger@whamcloud.com>
Fri, 10 Mar 2023 19:51:00 +0000 (19:51 +0000)
- Set a UTF-8 locale before running shell commands
- Use the %s format when the SCM script is produced, to protect
UTF-8 symbols
- Use fnmatch when comparing UTF-8 strings
case-insensitively
- Added tests for UTF-8 file names and UTF-8 paths
- Added the test name to the path name in test_109

Test-Parameters: trivial testlist=sanity-lipe-scan3,sanity-lipe-find3,sanityn
Signed-off-by: Alexandre Ioffe <aioffe@ddn.com>
Change-Id: I6c311ac0b2725f5414a5828d5577343ac8a22dee
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/50066
Reviewed-by: Jian Yu <yujian@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lipe/src/lipe_find3/lf3_parse.y
lipe/src/lipe_scan3/ls3_main.c
lustre/tests/sanity-lipe-find3.sh
lustre/tests/test-framework.sh

index a5a6750..0aee9d7 100644 (file)
@@ -643,8 +643,18 @@ static char *lf3_type_expr(int begin, int end)
 
 static const char LF3_FNMATCH[] = "fnmatch?";
 static const char LF3_FNMATCH_CI[] = "fnmatch-ci?";
-static const char LF3_STRING_EQ[] = "string=?";
-static const char LF3_STRING_CI_EQ[] = "string-ci=?";
+static const char LF3_STRING_EQ[] = "streq?";
+static const char LF3_STRING_CI_EQ[] = "streq-ci?";
+
+/* Return 1 when every byte of str before the terminating NUL passes
+ * isascii(), and 0 as soon as one byte does not; an empty string
+ * therefore yields 1.
+ * If not ascii string assume it is UTF-8: lf3_fnmatch_expr() uses a 0
+ * result to append the extra " 1" flag to the generated expression.
+ */
+static int is_str_ascii(const char *str)
+{
+       int i;
+       for(i=0; str[i] != 0; i++)
+               if (!isascii(str[i]))
+                       return 0;
+       return 1;
+}
 
 static char *lf3_fnmatch_expr(const char *which /* LF3_FNMATCH, LF3_FNMATCH_CI */,
                             const char *pattern,
@@ -652,6 +662,7 @@ static char *lf3_fnmatch_expr(const char *which /* LF3_FNMATCH, LF3_FNMATCH_CI *
 {
        const char *p_str;
        const char *v_match;
+       const char *utf8_flag = " ";
 
        /* From fnmatch(3): the fnmatch() function checks whether the
         * string argument matches the pattern argument, which is a
@@ -666,22 +677,25 @@ static char *lf3_fnmatch_expr(const char *which /* LF3_FNMATCH, LF3_FNMATCH_CI *
                /* pattern is a plain string. Do strength reduction. */
                if (strcmp(which, LF3_FNMATCH) == 0)
                        which = LF3_STRING_EQ;
-               else if (strcmp(which, LF3_FNMATCH_CI) == 0)
+               else if (strcmp(which, LF3_FNMATCH_CI) == 0) {
                        which = LF3_STRING_CI_EQ;
+                       if (!is_str_ascii(pattern))
+                               utf8_flag = " 1";
+               }
        }
 
        /* We want something equivalent to the following but without
-        * creating a closure for each inode:
+        * creating a closure for each inode (flag is optional):
         * (let ((match?
         *        (lambda (str)
-        *          (which pattern str))))
+        *          (which pattern str flag))))
         *   (any match? (thunk)))
         */
 
        p_str = LF3_GENSYM("str");
        v_match = LF3_GENVARF("match",
-                            "(lambda (%s) (%s %Q %s))",
-                            p_str, which, pattern, p_str);
+                            "(lambda (%s) (%s \"%s\" %s%s))",
+                            p_str, which, pattern, p_str, utf8_flag);
 
        return xsprintf("(%s %s)", caller, v_match);
 }
index 7424018..6b303ff 100644 (file)
@@ -22,6 +22,7 @@
 #include <lustre/lustreapi.h>
 #include <json-c/json.h>
 #include <libguile.h>
+#include <locale.h>
 #include "lipe_version.h"
 #include "list.h"
 #include "ls3_debug.h"
@@ -318,7 +319,8 @@ SCM_DEFINE(ls3_scm_fnmatch_p, "fnmatch?", 2, 1, 0,
        int rc;
 
        SCM_VALIDATE_STRING(1, pattern);
-       c_pattern = scm_to_latin1_string(pattern);
+       c_pattern = scm_to_utf8_string(pattern);
+
        SCM_VALIDATE_STRING(2, string);
        c_string = scm_to_latin1_string(string);
        if (SCM_UNBNDP(flags))
@@ -335,6 +337,71 @@ SCM_DEFINE(ls3_scm_fnmatch_p, "fnmatch?", 2, 1, 0,
 }
 #undef FUNC_NAME
 
+/*
+ * (streq? pattern string [flags]) - case-sensitive file name equality.
+ *
+ * pattern is converted with scm_to_utf8_string() and string with
+ * scm_to_latin1_string(); the two C strings are then compared with
+ * strcmp(). The optional flags argument is accepted but not used.
+ *
+ * Returns #t when the C strings are equal, #f otherwise.
+ *
+ * Both arguments are validated before either one is converted:
+ * SCM_VALIDATE_STRING() reports a bad argument by raising a Guile
+ * error instead of returning, so validating string only after pattern
+ * had been converted would leak the malloc'ed copy of pattern.
+ */
+SCM_DEFINE(ls3_scm_streq_p, "streq?", 2, 1, 0,
+          (SCM pattern, SCM string, SCM flags), "match two filenames")
+#define FUNC_NAME s_ls3_scm_streq_p
+{
+       char *c_pattern = NULL;
+       char *c_string = NULL;
+       int rc;
+
+       SCM_VALIDATE_STRING(1, pattern);
+       SCM_VALIDATE_STRING(2, string);
+
+       c_pattern = scm_to_utf8_string(pattern);
+       c_string = scm_to_latin1_string(string);
+
+       rc = strcmp(c_pattern, c_string);
+
+       free(c_pattern);
+       free(c_string);
+
+       return rc == 0 ? SCM_BOOL_T : SCM_BOOL_F;
+}
+#undef FUNC_NAME
+
+static int is_str_ascii(const char *str)
+{
+       int i;
+
+       for (i = 0; str[i] != 0; i++)
+               if (!isascii(str[i]))
+                       return 0;
+       return 1;
+}
+
+SCM_DEFINE(ls3_scm_streq_ci_p, "streq-ci?", 2, 1, 0,
+          (SCM pattern, SCM string, SCM utf8_flag), "compare two file names")
+#define FUNC_NAME s_ls3_scm_streq_ci_p
+{
+       char *c_pattern = NULL;
+       char *c_string = NULL;
+       int c_flags = 0;
+       int rc;
+
+       SCM_VALIDATE_STRING(1, pattern);
+       c_pattern = scm_to_utf8_string(pattern);
+
+       SCM_VALIDATE_STRING(2, string);
+
+       c_string = scm_to_latin1_string(string);
+       if (!SCM_UNBNDP(utf8_flag))
+               SCM_VALIDATE_INT_COPY(3, utf8_flag, c_flags);
+
+       if (c_flags || !is_str_ascii(c_string))
+               /* Non-ASCII pattern or file name. Use fnmatch */
+               rc = fnmatch(c_pattern, c_string, FNM_CASEFOLD);
+       else /* Both strings are ASCII. We can use lib C comparison */
+               rc = strcasecmp(c_pattern, c_string);
+
+       free(c_pattern);
+       free(c_string);
+
+       return rc == 0 ? SCM_BOOL_T : SCM_BOOL_F;
+}
+#undef FUNC_NAME
+
+
 SCM_DEFINE(ls3_scm_debug_enable, "lipe-debug-enable", 0, 1, 0,
           (SCM enable), "get or set debugging")
 {
@@ -1230,6 +1297,8 @@ static void ls3_module_init(void *unused)
        scm_c_export(
        "FNM_CASEFOLD",
        "fnmatch?",
+       "streq?",
+       "streq-ci?",
        "<fid>",
        "fid-seq",
        "fid-oid",
@@ -1453,6 +1522,8 @@ int main(int argc, char *argv[])
 {
        lipe_version_init();
 
+       setlocale(LC_CTYPE, "");
+
        /* scm_boot_guile() does not return. Instead it calls exit(0)
         * when ls3_main_scm() returns. To exit with another
         * status we need to call exit() from ls3_main_scm()
index c4d7b5c..0afcf4d 100644 (file)
@@ -639,7 +639,6 @@ test_108() {
        expect1 "$fid" lipe_find3_facet mds1 -name "$tfile" -print-file-fid
        expect1 "$fid" lipe_find3_facet mds1 -name "$tfile*" -print-file-fid
        expect1 "$fid" lipe_find3_facet mds1 -name "*$tfile" -print-file-fid
-       expect1 "$fid" lipe_find3_facet mds1 -name "$tfile*" -print-file-fid
        expect_empty lipe_find3_facet mds1 -name "dagobert.txt"
 
        mv "$file" "$MOUNT/zalf.x"
@@ -707,7 +706,6 @@ test_109() {
        expect1 "$fid" lipe_find3_facet mds1 -path "${tdir}-*2?" -print-file-fid
        expect1 "$fid" lipe_find3_facet mds1 -path "${tdir}-?1*" -print-file-fid
 
-
        expect_empty lipe_find3_facet mds1 -path "${tdir}-Z12?"
        expect_empty lipe_find3_facet mds1 -path "${tdir}-Z1*"
        expect_empty lipe_find3_facet mds1 -path "${tdir}-xxx"
@@ -843,6 +841,133 @@ test_113() {
 }
 run_test 113 "lipe_find3 -mirror-count does the right thing"
 
+test_114() {
+       local utf8="種多語言神經機"              # Chinese glyphs have no upper/lower case
+       local test_file="${tfile}-${utf8}.txt"
+       local file=$MOUNT/$test_file
+       local fid
+
+       init_lipe_find3_env "$file"
+       fid=$($LFS path2fid "$file")
+
+       expect1 "$fid" lipe_find3_facet mds1 -name "$test_file"   -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -name "$test_file*"  -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -name "*$test_file"  -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -name "*.txt"        -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -name "*種多語言神經?.txt" -print-file-fid
+
+       # Note: Chinese glyphs have no upper/lower case; -iname must still match them
+       expect1 "$fid" lipe_find3_facet mds1 -iname "$test_file"   -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -iname "$test_file*"  -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -iname "*$test_file"  -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -iname "*.txt"        -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -iname "*多語言神經?.txt"  -print-file-fid
+
+       utf8="가나다"                        # Korean glyphs have no upper/lower case
+       test_file="${tfile}-${utf8}.txt"
+       mv "$file" "$MOUNT/$test_file"
+       file="$MOUNT/$test_file"
+
+       expect1 "$fid" lipe_find3_facet mds1 -name "$test_file"  -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -name "$test_file*" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -name "*$test_file" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -name "*.txt"       -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -name "*나?.txt"     -print-file-fid
+
+       # Note: Korean glyphs have no upper/lower case; -iname must still match them
+       expect1 "$fid" lipe_find3_facet mds1 -iname "$test_file"  -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -iname "$test_file*" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -iname "*$test_file" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -iname "*.txt"       -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -iname "*나?.txt"     -print-file-fid
+
+       utf8="ÖŞn"                                    # Turkish, upper-case Ö and Ş
+       test_file="${tfile}-${utf8}.txt"
+       mv "$file" "$MOUNT/$test_file"
+       file="$MOUNT/$test_file"
+
+       expect1 "$fid" lipe_find3_facet mds1 -name "$test_file"  -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -name "$test_file*" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -name "*$test_file" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -name "*.txt"       -print-file-fid
+
+       expect1 "$fid" lipe_find3_facet mds1 -iname "$test_file"  -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -iname "$test_file*" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -iname "*$test_file" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -iname "*.txt"       -print-file-fid
+
+       utf8="öşn"                            # Turkish lower case; no mv, the file keeps its ÖŞn name
+       test_file="${tfile}-${utf8}.txt"
+
+       expect1 "$fid" lipe_find3_facet mds1 -iname "*ş*.txt"    -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -iname "*Ş*"        -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -iname "$test_file" -print-file-fid
+
+       utf8="ßäü"                           # German lower-case letters
+       test_file="${tfile}-${utf8}.txt"
+       mv "$file" "$MOUNT/$test_file"
+       file="$MOUNT/$test_file"
+
+       expect1 "$fid" lipe_find3_facet mds1 -name "$test_file" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -name "$test_file*" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -name "*$test_file" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -name "*ü.txt" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -name "*ß?ü.txt" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -name "*.txt" -print-file-fid
+
+       expect1 "$fid" lipe_find3_facet mds1 -iname "$test_file" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -iname "$test_file*" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -iname "*$test_file" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -iname "*.txt" -print-file-fid
+
+       utf8="ẞäÜ"                          # German mixed case; no mv, the file keeps its ßäü name
+       test_file="${tfile}-${utf8}.txt"
+       expect1 "$fid" lipe_find3_facet mds1 -iname "*Ü.txt" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -iname "$test_file" -print-file-fid
+
+       init_lipe_find3_env
+}
+run_test 114 "lipe_find3 -name and -iname with UTF-8"
+
+test_115() {
+       local td="${tdir}-多言神經-가나다-ßäü"
+       local dir="${MOUNT}/${td}"
+       local fid
+
+       init_lipe_find3_env
+
+       lfs mkdir -i 0 $dir
+       sync_all_data_and_delay
+
+       fid=$($LFS path2fid "$dir")
+
+       expect1 "$fid" lipe_find3_facet mds1 -path "${td}"   -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -path "${td}*"  -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -path "*${td}"  -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -path "*${td}*" -print-file-fid
+
+       expect1 "$fid" lipe_find3_facet mds1 -path "${tdir}-多言神經-가나다-ßä*" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -path "${tdir}-*言神經-가나다-ßäü" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -path "*言神經-가나다-ßä*"         -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -path "${tdir}-多*ü"          -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -path "${tdir}-多言神經-가나다-ß?ü" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -path "${tdir}-多言神經-가나다-ßä?" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -path "${tdir}-?言神經-가나다-ßäü" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -path "${tdir}-?言神經-가나다-ß*ü" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -path "*言神經-가나다-ßä?"         -print-file-fid
+
+       expect_empty lipe_find3_facet mds1 -path "${tdir}-多言神經-가나다-ßäÜ"
+       expect_empty lipe_find3_facet mds1 -path "${tdir}-*Ü"
+
+       expect1 "$fid" lipe_find3_facet mds1 -ipath "${td}" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -ipath "${tdir}-多言神經-가나다-ẞ?Ü*" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -ipath "*ẞä*" -print-file-fid
+       expect1 "$fid" lipe_find3_facet mds1 -ipath "*Ü"   -print-file-fid
+
+       init_lipe_find3_env
+}
+run_test 115 "lipe_find3 -path and -ipath with UTF-8"
+
 test_130() {
        local file=$MOUNT/$tfile
        local xtime
@@ -978,8 +1103,7 @@ test_300() {
        expect1 4815162342 lipe_find3_facet mds1 -exec cat {} \;
 
        # $ find /mnt/lustre -exec zalp {} \;
-       # find: ‘zalp’: No such file or directory
-       # find: ‘zalp’: No such file or directory
+       # find: "zalp": No such file or directory
        # $ echo $?
        # 0
 
index 8abd669..64b3ce2 100755 (executable)
@@ -4147,8 +4147,9 @@ do_node_vp() {
        local host="$1"
        shift
 
+       # *Important* to set LC_CTYPE to be able to test UTF-8 symbols
        if [[ "$host" == "$HOSTNAME" ]]; then
-               sh -c "$(printf -- ' %q' "$@")"
+               sh -c "export \"LC_CTYPE=en_US.UTF-8\"; $(printf -- ' %q' "$@")"
                return $?
        fi
 
@@ -4159,7 +4160,7 @@ do_node_vp() {
 
        # -N Disable hostname: prefix on lines of output.
 
-       $PDSH "${host}" -N "cd $RPWD; PATH=\$PATH:$RLUSTRE/utils:$RLUSTRE/tests:/sbin:/usr/sbin; export LUSTRE=$RLUSTRE; $(printf -- ' %q' "$@")"
+       $PDSH "${host}" -N "export \"LC_CTYPE=en_US.UTF-8\"; cd $RPWD; PATH=\$PATH:$RLUSTRE/utils:$RLUSTRE/tests:/sbin:/usr/sbin; export LUSTRE=$RLUSTRE; $(printf -- ' %q' "$@")"
 }
 
 single_local_node () {