From 61269e2fc2682670256a33a24c2130a10fe6b700 Mon Sep 17 00:00:00 2001 From: David Bigagli <david@schedmd.com> Date: Wed, 15 Jan 2014 14:34:56 -0800 Subject: [PATCH] Modify the reading of lustre file system statistics to print more information when debugging and when I/O errors occur. --- NEWS | 2 + .../lustre/acct_gather_filesystem_lustre.c | 122 +++++++++++------- 2 files changed, 78 insertions(+), 46 deletions(-) diff --git a/NEWS b/NEWS index be0b680c732..3e2d997274f 100644 --- a/NEWS +++ b/NEWS @@ -13,6 +13,8 @@ documents those changes that are of interest to users and admins. -- Modified 'sacctmgr show associations' command to show GrpCPURunMins by default. -- Replace the hostlist_push() function with a more efficient hostlist_push_host(). + -- Modify the reading of lustre file system statistics to print more information + when debugging and when I/O errors occur. * Changes in Slurm 14.03.0pre5 ============================== diff --git a/src/plugins/acct_gather_filesystem/lustre/acct_gather_filesystem_lustre.c b/src/plugins/acct_gather_filesystem/lustre/acct_gather_filesystem_lustre.c index 6283fe19a66..5e67f51e554 100644 --- a/src/plugins/acct_gather_filesystem/lustre/acct_gather_filesystem_lustre.c +++ b/src/plugins/acct_gather_filesystem/lustre/acct_gather_filesystem_lustre.c @@ -120,7 +120,7 @@ static uint32_t debug_flags = 0; static pthread_mutex_t lustre_lock = PTHREAD_MUTEX_INITIALIZER; /* Default path to lustre stats */ -const char proc_base_path[] = "/proc/fs/lustre/"; +const char proc_base_path[] = "/proc/fs/lustre"; /** * is lustre fs supported @@ -142,8 +142,8 @@ static int _check_lustre_fs(void) sprintf(lustre_directory, "%s/llite", proc_base_path); proc_dir = opendir(proc_base_path); if (!proc_dir) { - debug2("not able to read %s", - lustre_directory); + error("%s: not able to read %s %m", + __func__, lustre_directory); rc = SLURM_FAILURE; } else { closedir(proc_dir); @@ -155,8 +155,17 @@ static int _check_lustre_fs(void) return rc; } -/** - * read counters 
from all mounted lustre fs +/* _read_lustre_counters() + * Read counters from all mounted lustre fs + * from the file stats under the directories: + * + * /proc/fs/lustre/llite/lustre-xxxx + * + * From the file stat we use 2 entries: + * + * read_bytes 17996 samples [bytes] 0 4194304 30994606834 + * write_bytes 9007 samples [bytes] 2 4194304 31008331389 + * */ static int _read_lustre_counters(void ) { @@ -172,60 +181,81 @@ static int _read_lustre_counters(void ) proc_dir = opendir(lustre_dir); if (proc_dir == NULL) { - error("Cannot open %s\n", lustre_dir); + error("%s: Cannot open %s %m", __func__, lustre_dir); return SLURM_FAILURE; } - entry = readdir(proc_dir); + while ((entry = readdir(proc_dir))) { + bool bread; + bool bwrote; + + if (strcmp(entry->d_name, ".") == 0 + || strcmp(entry->d_name, "..") == 0) + continue; - while (entry != NULL) { snprintf(path_stats, PATH_MAX - 1, "%s/%s/stats", lustre_dir, - entry->d_name); - debug3("Found file %s\n", path_stats); + entry->d_name); + debug3("%s: Found file %s", __func__, path_stats); fff = fopen(path_stats, "r"); - if (fff) { - while(1) { - if (!fgets(buffer,BUFSIZ,fff)) - break; - - if (strstr(buffer, "write_bytes")) { - sscanf(buffer, - "%*s %"PRIu64" %*s %*s " - "%*d %*d %"PRIu64"", - &lustre_se.lustre_nb_writes, - &lustre_se.lustre_write_bytes); - debug3("Lustre Counter " - "%"PRIu64" " - "write_bytes %"PRIu64" " - "writes", - lustre_se.lustre_write_bytes, - lustre_se.lustre_nb_writes); - } - - if (strstr(buffer, "read_bytes")) { - sscanf(buffer, - "%*s %"PRIu64" %*s %*s " - "%*d %*d %"PRIu64"", - &lustre_se.lustre_nb_reads, - &lustre_se.lustre_read_bytes); - debug3("Lustre Counter " - "%"PRIu64" " - "read_bytes %"PRIu64" " - "reads", - lustre_se.lustre_read_bytes, - lustre_se.lustre_nb_reads); - } + if (fff == NULL) { + error("%s: Cannot open %s %m", __func__, path_stats); + continue; + } + + bread = bwrote = false; + while (fgets(buffer, BUFSIZ, fff)) { + + if (bread && bwrote) + break; + + if 
(strstr(buffer, "write_bytes")) { + sscanf(buffer, + "%*s %"PRIu64" %*s %*s " + "%*d %*d %"PRIu64"", + &lustre_se.lustre_nb_writes, + &lustre_se.lustre_write_bytes); + debug3("%s " + "%"PRIu64" " + "write_bytes %"PRIu64" " + "writes", + __func__, + lustre_se.lustre_write_bytes, + lustre_se.lustre_nb_writes); + bwrote = true; + } + + if (strstr(buffer, "read_bytes")) { + sscanf(buffer, + "%*s %"PRIu64" %*s %*s " + "%*d %*d %"PRIu64"", + &lustre_se.lustre_nb_reads, + &lustre_se.lustre_read_bytes); + debug3("%s " + "%"PRIu64" " + "read_bytes %"PRIu64" " + "reads", + __func__, + lustre_se.lustre_read_bytes, + lustre_se.lustre_nb_reads); + bread = true; } - fclose(fff); } - entry = readdir(proc_dir); + fclose(fff); + lustre_se.all_lustre_write_bytes += lustre_se.lustre_write_bytes; lustre_se.all_lustre_read_bytes += lustre_se.lustre_read_bytes; lustre_se.all_lustre_nb_writes += lustre_se.lustre_nb_writes; lustre_se.all_lustre_nb_reads += lustre_se.lustre_nb_reads; - } + debug3("%s: all_lustre_write_bytes %lu all_lustre_read_bytes %lu ", + __func__, lustre_se.all_lustre_write_bytes, + lustre_se.all_lustre_read_bytes); + debug3("%s: all_lustre_nb_writes %lu all_lustre_nb_reads %lu", + __func__, lustre_se.all_lustre_nb_writes, + lustre_se.all_lustre_nb_reads); + + } /* while ((entry = readdir(proc_dir))) */ closedir(proc_dir); lustre_se.last_update_time = lustre_se.update_time; @@ -258,7 +288,7 @@ static int _update_node_filesystem(void) fls->write_size = (double) lustre_se.all_lustre_write_bytes / 1048576; acct_gather_profile_g_add_sample_data(ACCT_GATHER_PROFILE_LUSTRE, fls); - debug3("Collection of Lustre counters Finished"); + debug3("%s: Collection of Lustre counters Finished", __func__); xfree(fls); -- GitLab