In response to the clibs C package manager issue "Filter packages by tags and features. E.g. Is this package heapless?" (#322), I created this C source scanner. It scans C source code and generates keyword suggestions based on properties of the code that are relevant to people browsing the clib package list, such as whether the source code uses dynamic memory (malloc, free, realloc).


How It Works

The scanner works as follows:

  1. Comment and String Removal:

    • The tool skips single-line (//) and multi-line (/* ... */) comments, as well as single-quoted (' ... ') and double-quoted (" ... ") literals, to ensure only meaningful tokens are processed.
  2. Token Scanning:

    • Tokens are extracted from the source file, with special attention to function calls.
  3. Keyword Detection:

    • The scanner checks for specific keywords and maintains a list of detected features of relevance to users.
  4. Tag Emission:

    • Based on the detected features, the scanner outputs a JSON-like list of tags to describe the source code.

Example Output

Detect heap use and goto usage

Input with stderr output disabled 2>&-:

tcc -run ./source_keyword_suggester.c 2>&- << HEREDOC
#include <stdlib.h>
int main() {
    int *ptr = malloc(100);
    free(ptr);
    goto end;
end:
    return 0;
}
HEREDOC

Output:

["heap used", "malloc", "free", "goto used"]

Show tokens detected

This shows which tokens the scanner detects and which ones it ignores.

Input with stdout output disabled 1>&-:

tcc -run ./source_keyword_suggester.c 1>&- << HEREDOC
#include <stdlib.h>
int main() {
    int *ptr = malloc(100);
    free(ptr);
    goto end;
end:
    return 0;
}
HEREDOC

Output:

include
stdlib.h
int
main()
int
ptr
malloc()
free()
ptr
goto
end
end
return

Source

/*
    Clibs Source Scanner For Keyword Suggestion
    Author: Brian Khuu (2025)

    The idea is to provide a mechanism for scanning source code and
    giving some keyword suggestions based on the properties of the
    source code. In this case, I would like to mark a source code as
    using dynamic memory or not. This won't be perfect, but it may help
    encourage library writers to use keywords if it's already provided.

    # MIT License

    Copyright (c) 2025 Brian Khuu

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

    The above copyright notice and this permission notice shall be included in all
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    SOFTWARE.

 */

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

// Maximum number of characters stored for one token (excluding the NUL).
// Longer identifiers are truncated to this length.
#define TOKEN_MAX 256

/*
    Streaming state for sourceTokenScanner().
    Zero-initialise (e.g. `= {0}`) before feeding the first character.
 */
typedef struct SourceTokenScanner
{
    // Last significant character seen in the current mode:
    //  - in code:            the previous character (used to spot `//` and `/*`)
    //  - in a literal:       '\\' while an escape sequence is pending
    //  - in a block comment: '*' while a closing `*/` may follow
    int prev;
    // Opening quote (' or ") of the literal being skipped, or 0 when not in one
    int quote_char;
    bool in_single_line_comment;
    bool in_multi_line_comment;

    // True when the last call returned a token; cleared on the next call
    bool token_ready;
    // True when the returned token was immediately followed by '('
    bool token_is_function;
    // NUL terminated token text
    char token[TOKEN_MAX + 1];
    size_t token_size;
} SourceTokenScanner;

/*
    Features detected in the scanned source. Each flag is set once the
    corresponding token has been seen at least once.
 */
typedef struct SourceTagger
{
    bool uses_malloc;
    bool uses_free;
    bool uses_realloc;
    bool uses_goto;
} SourceTagger;

/*
    Feed one character of C source into the scanner.

    Comments and string/character literals are skipped. Runs of
    [A-Za-z0-9_.] are collected as tokens; a token is reported when the
    first character that cannot belong to it arrives.

    context : scanner state (zero-initialised before the first call)
    ch      : next character of the source

    Returns true when a token is available in context->token, in which case
    context->token_is_function tells whether the terminating character was
    '('. The token is only valid until the next call.

    Heuristics / limitations:
      - Tokens of one or two characters, or starting with a digit or '.',
        are dropped.
      - A token is flagged as a function only if '(' follows it directly
        (`malloc (1)` is reported as a plain token).
      - A token directly followed by a quote character (no separator) is
        dropped.
 */
bool sourceTokenScanner(SourceTokenScanner *context, const char ch)
{
    // The token returned by the previous call has been consumed
    if (context->token_ready)
    {
        context->token[0] = '\0';
        context->token_size = 0;
        context->token_ready = false;
    }

    // Inside a string or character literal
    if (context->quote_char)
    {
        if (context->prev == '\\')
        {
            // `ch` is escaped: consume it, so that an escaped backslash
            // (e.g. '\\') does not also escape the closing quote
            context->prev = 0;
        }
        else if (ch == context->quote_char)
        {
            // Literal closed. Forget its content so that e.g. '/'*2 is not
            // mistaken for the start of a block comment
            context->quote_char = '\0';
            context->prev = 0;
        }
        else
        {
            context->prev = ch;
        }
        return false;
    }

    // Inside a `//` comment: skip until end of line
    if (context->in_single_line_comment)
    {
        if (ch == '\n')
        {
            context->in_single_line_comment = false;
        }
        return false;
    }

    // Inside a `/* ... */` comment: skip until the closing `*/`
    if (context->in_multi_line_comment)
    {
        if (context->prev == '*' && ch == '/')
        {
            context->in_multi_line_comment = false;
            context->prev = '\0';
        }
        else
        {
            context->prev = ch;
        }
        return false;
    }

    // Start of a literal. Checked only after the comment states above so
    // that an apostrophe or quote inside a comment is not treated as one.
    if (ch == '\'' || ch == '"')
    {
        context->quote_char = ch;
        context->prev = 0;
        context->token[0] = '\0';
        context->token_size = 0;
        return false;
    }

    // Start of a comment
    if (context->prev == '/' && (ch == '/' || ch == '*'))
    {
        if (ch == '/')
        {
            context->in_single_line_comment = true;
        }
        else
        {
            context->in_multi_line_comment = true;
        }
        // Reset so the opening '/' cannot pair with whatever follows the
        // comment (or with the first character inside a block comment)
        context->prev = 0;
        context->token[0] = '\0';
        context->token_size = 0;
        return false;
    }

    context->prev = ch;

    // Accumulate token characters ('.' is kept so header names stay whole)
    if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || ('0' <= ch && ch <= '9') || (ch == '_') || (ch == '.'))
    {
        if (context->token_size < TOKEN_MAX)
        {
            context->token[context->token_size++] = ch;
            context->token[context->token_size] = '\0';
        }
        return false;
    }

    // A separator arrived: finish the token in progress, if any
    if (context->token_size > 0)
    {
        if (context->token_size <= 2 || ('0' <= context->token[0] && context->token[0] <= '9') || (context->token[0] == '.'))
        {
            // Exclude short tokens or invalid starting characters
            // Dev Note: Shortest function name is 3 characters long (e.g. sin, cos, tan, log etc...)
            context->token[0] = '\0';
            context->token_size = 0;
            return false;
        }

        // Heuristic Token Found
        context->token_is_function = (ch == '(');
        context->token_ready = true;
        return true;
    }

    return false;
}

/*
    Append `tag` to the tag list unless an equal string is already present.

    tags      : array of tag strings; only the first *tag_count entries
                are considered part of the list
    tag_count : in/out number of entries currently stored in `tags`
    max_tags  : capacity of `tags`
    tag       : string to add. The pointer is stored as-is (not copied), so
                it must outlive the list.

    The tag is silently dropped when the list is already full.
 */
void add_tags(char **tags, size_t *tag_count, size_t max_tags, char *tag)
{
    if (tags == NULL || tag_count == NULL || tag == NULL)
    {
        return;
    }

    // Check if tag already added. Only the populated entries are inspected,
    // so slots past *tag_count need not be initialised by the caller.
    for (size_t i = 0; i < *tag_count && i < max_tags; i++)
    {
        if (tags[i] != NULL && (strcmp(tags[i], tag) == 0))
        {
            return;
        }
    }

    // Add tag
    if (*tag_count < max_tags)
    {
        tags[*tag_count] = tag;
        *tag_count = *tag_count + 1;
    }
}

/*
    Read C source from stdin, print every detected token to stderr (one per
    line, with "()" appended when it looks like a function call) and print
    the suggested tags to stdout as a JSON-like list.
 */
int main(void)
{
    // getchar() returns int: storing it in a char makes EOF indistinguishable
    // from byte 0xFF (signed char) or unreachable (unsigned char)
    int ch;
    SourceTokenScanner sourceTokenScannerState = {0};
    SourceTagger sourceTagger = {0};

    /* Scan Source For Indicator Tokens */
    while ((ch = getchar()) != EOF)
    {
        if (!sourceTokenScanner(&sourceTokenScannerState, (char)ch))
        {
            continue;
        }

        const char *token = sourceTokenScannerState.token;
        if (sourceTokenScannerState.token_is_function)
        {
            // Function
            if (strcmp(token, "malloc") == 0)
            {
                sourceTagger.uses_malloc = true;
            }
            else if (strcmp(token, "free") == 0)
            {
                sourceTagger.uses_free = true;
            }
            else if (strcmp(token, "realloc") == 0)
            {
                sourceTagger.uses_realloc = true;
            }
        }
        else
        {
            // Keyword
            if (strcmp(token, "goto") == 0)
            {
                sourceTagger.uses_goto = true;
            }
        }
        fprintf(stderr, "%s%s\n", token, sourceTokenScannerState.token_is_function ? "()" : "");
    }

    /* Generate Tag Suggestions */
    char *tags[100] = {NULL};
    const size_t max_tags = sizeof(tags) / sizeof(tags[0]);
    size_t tag_count = 0;
    if (!sourceTagger.uses_malloc && !sourceTagger.uses_free && !sourceTagger.uses_realloc)
    {
        add_tags(tags, &tag_count, max_tags, "no heap");
        add_tags(tags, &tag_count, max_tags, "heapless");
        add_tags(tags, &tag_count, max_tags, "no malloc");
    }

    // "heap used" may be requested several times below; add_tags() drops duplicates
    if (sourceTagger.uses_malloc)
    {
        add_tags(tags, &tag_count, max_tags, "heap used");
        add_tags(tags, &tag_count, max_tags, "malloc");
    }

    if (sourceTagger.uses_free)
    {
        add_tags(tags, &tag_count, max_tags, "heap used");
        add_tags(tags, &tag_count, max_tags, "free");
    }

    if (sourceTagger.uses_realloc)
    {
        add_tags(tags, &tag_count, max_tags, "heap used");
        add_tags(tags, &tag_count, max_tags, "realloc");
    }

    if (sourceTagger.uses_goto)
    {
        add_tags(tags, &tag_count, max_tags, "goto used");
    }

    /* Print Tag Suggestions */
    printf("[");
    for (size_t i = 0; i < tag_count; i++)
    {
        if (i > 0)
        {
            printf(", ");
        }
        printf("\"%s\"", tags[i]);
    }
    printf("]");

    return 0;
}