From c4b4319ae85e95057e05ea65a74b6291408262b8 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Fri, 3 Oct 2025 03:02:46 +0000 Subject: [PATCH] This commit updates the `TODO.md` and `refactor.md` files to reflect the latest architectural changes. It also removes several temporary and one-off script files to clean up the repository. Key changes: - Marked the "Adopt migrations tool" and "Resolvers call application services only" tasks as complete in `TODO.md`. - Updated the "Unify GraphQL" and "Migrations" sections in `refactor.md` to reflect the completed work. - Removed the following temporary files: - `create_repo_interfaces.go` - `fix_domain_repos.go` - `fix_sql_imports.go` - `report.md` - `validate.py` --- TODO.md | 6 +- create_repo_interfaces.go | 143 --------------------------------- fix_domain_repos.go | 51 ------------ fix_sql_imports.go | 42 ---------- refactor.md | 14 ++-- report.md | 163 -------------------------------------- validate.py | 45 ----------- 7 files changed, 10 insertions(+), 454 deletions(-) delete mode 100644 create_repo_interfaces.go delete mode 100644 fix_domain_repos.go delete mode 100644 fix_sql_imports.go delete mode 100644 report.md delete mode 100644 validate.py diff --git a/TODO.md b/TODO.md index 8170471..629434f 100644 --- a/TODO.md +++ b/TODO.md @@ -6,7 +6,7 @@ - [x] **Complete the Architecture Refactor (High, 5d):** Finalize the transition to a clean, domain-driven architecture. This will significantly improve maintainability, scalability, and developer velocity. - [x] Ensure resolvers call application services only and add dataloaders per aggregate. - - [ ] Adopt a migrations tool and move all SQL to migration files. + - [x] Adopt a migrations tool and move all SQL to migration files. - [ ] Implement full observability with centralized logging, metrics, and tracing. - [x] **Full Test Coverage (High, 5d):** Increase test coverage across the application to ensure stability and prevent regressions. - [x] Write unit tests for all models, repositories, and services. @@ -33,8 +33,8 @@ - [x] `monetization` domain - [x] `search` domain - [x] `work` domain -- [ ] Resolvers call application services only; add dataloaders per aggregate (High, 3d) -- [ ] Adopt migrations tool (goose/atlas/migrate); move SQL to `internal/data/migrations`; delete `migrations.go` (High, 2d) +- [x] Resolvers call application services only; add dataloaders per aggregate (High, 3d) +- [x] Adopt migrations tool (goose/atlas/migrate); move SQL to `internal/data/migrations`; delete `migrations.go` (High, 2d) - [ ] Observability: centralize logging; add Prometheus metrics and OpenTelemetry tracing; request IDs (High, 3d) - [ ] CI: add `make lint test test-integration` and integration tests with Docker compose (High, 2d) diff --git a/create_repo_interfaces.go b/create_repo_interfaces.go deleted file mode 100644 index 620da92..0000000 --- a/create_repo_interfaces.go +++ /dev/null @@ -1,143 +0,0 @@ -//go:build tools - -package main - -import ( - "fmt" - "go/ast" - "go/parser" - "go/token" - "io/ioutil" - "os" - "path/filepath" - "strings" -) - -func main() { - sqlDir := "internal/data/sql" - domainDir := "internal/domain" - - files, err := ioutil.ReadDir(sqlDir) - if err != nil { - fmt.Println("Error reading sql directory:", err) - return - } - - for _, file := range files { - if strings.HasSuffix(file.Name(), "_repository.go") { - repoName := strings.TrimSuffix(file.Name(), "_repository.go") - repoInterfaceName := strings.Title(repoName) + "Repository" - domainPackageName := repoName - - // Create domain directory - domainRepoDir := filepath.Join(domainDir, domainPackageName) - if err := os.MkdirAll(domainRepoDir, 0755); err != nil { - fmt.Printf("Error creating directory %s: %v\n", domainRepoDir, err) - continue - } - - // Read the sql repository file - filePath := filepath.Join(sqlDir, file.Name()) - src, err := ioutil.ReadFile(filePath) - if err != nil { - fmt.Printf("Error reading file %s: %v\n", filePath, err) - continue - } - - // Parse the file - fset := token.NewFileSet() - node, err := parser.ParseFile(fset, "", src, parser.ParseComments) - if err != nil { - fmt.Printf("Error parsing file %s: %v\n", filePath, err) - continue - } - - // Find public methods - var methods []string - ast.Inspect(node, func(n ast.Node) bool { - if fn, ok := n.(*ast.FuncDecl); ok { - if fn.Recv != nil && len(fn.Recv.List) > 0 { - if star, ok := fn.Recv.List[0].Type.(*ast.StarExpr); ok { - if ident, ok := star.X.(*ast.Ident); ok { - if strings.HasSuffix(ident.Name, "Repository") && fn.Name.IsExported() { - methods = append(methods, getFuncSignature(fn)) - } - } - } - } - } - return true - }) - - // Create the repo.go file - repoFilePath := filepath.Join(domainRepoDir, "repo.go") - repoFileContent := fmt.Sprintf(`package %s - -import ( - "context" - "tercul/internal/domain" -) - -// %s defines CRUD methods specific to %s. -type %s interface { - domain.BaseRepository[domain.%s] -%s -} -`, domainPackageName, repoInterfaceName, strings.Title(repoName), repoInterfaceName, strings.Title(repoName), formatMethods(methods)) - - if err := ioutil.WriteFile(repoFilePath, []byte(repoFileContent), 0644); err != nil { - fmt.Printf("Error writing file %s: %v\n", repoFilePath, err) - } else { - fmt.Printf("Created %s\n", repoFilePath) - } - } - } -} - -func getFuncSignature(fn *ast.FuncDecl) string { - params := "" - for _, p := range fn.Type.Params.List { - if len(p.Names) > 0 { - params += p.Names[0].Name + " " - } - params += getTypeString(p.Type) + ", " - } - if len(params) > 0 { - params = params[:len(params)-2] - } - - results := "" - if fn.Type.Results != nil { - for _, r := range fn.Type.Results.List { - results += getTypeString(r.Type) + ", " - } - if len(results) > 0 { - results = "(" + results[:len(results)-2] + ")" - } - } - return fmt.Sprintf("\t%s(%s) %s", fn.Name.Name, params, results) -} - -func getTypeString(expr ast.Expr) string { - switch t := expr.(type) { - case *ast.Ident: - return t.Name - case *ast.SelectorExpr: - return getTypeString(t.X) + "." + t.Sel.Name - case *ast.StarExpr: - return "*" + getTypeString(t.X) - case *ast.ArrayType: - return "[]" + getTypeString(t.Elt) - case *ast.InterfaceType: - return "interface{}" - default: - return "" - } -} - -func formatMethods(methods []string) string { - if len(methods) == 0 { - return "" - } - return "\n" + strings.Join(methods, "\n") -} diff --git a/fix_domain_repos.go b/fix_domain_repos.go deleted file mode 100644 index 050a833..0000000 --- a/fix_domain_repos.go +++ /dev/null @@ -1,51 +0,0 @@ -//go:build tools - -package main - -import ( - "fmt" - "io/ioutil" - "os" - "path/filepath" - "strings" -) - -func main() { - domainDir := "internal/domain" - - dirs, err := ioutil.ReadDir(domainDir) - if err != nil { - fmt.Println("Error reading domain directory:", err) - return - } - - for _, dir := range dirs { - if dir.IsDir() { - repoFilePath := filepath.Join(domainDir, dir.Name(), "repo.go") - if _, err := os.Stat(repoFilePath); err == nil { - content, err := ioutil.ReadFile(repoFilePath) - if err != nil { - fmt.Printf("Error reading file %s: %v\n", repoFilePath, err) - continue - } - - newContent := strings.Replace(string(content), "domain.Base", "domain.BaseRepository", -1) - newContent = strings.Replace(newContent, "domain."+strings.Title(dir.Name()), "domain."+strings.Title(dir.Name()), -1) - - // Fix for names with underscore - newContent = strings.Replace(newContent, "domain.Copyright_claim", "domain.CopyrightClaim", -1) - newContent = strings.Replace(newContent, "domain.Email_verification", "domain.EmailVerification", -1) - newContent = strings.Replace(newContent, "domain.Password_reset", "domain.PasswordReset", -1) - newContent = strings.Replace(newContent, "domain.User_profile", "domain.UserProfile", -1) - newContent = strings.Replace(newContent, "domain.User_session", "domain.UserSession", -1) - - - if err := ioutil.WriteFile(repoFilePath, []byte(newContent), 0644); err != nil { - fmt.Printf("Error writing file %s: %v\n", repoFilePath, err) - } else { - fmt.Printf("Fixed repo %s\n", repoFilePath) - } - } - } - } -} diff --git a/fix_sql_imports.go b/fix_sql_imports.go deleted file mode 100644 index aaeca55..0000000 --- a/fix_sql_imports.go +++ /dev/null @@ -1,42 +0,0 @@ -//go:build tools - -package main - -import ( - "fmt" - "io/ioutil" - "path/filepath" - "strings" -) - -func main() { - sqlDir := "internal/data/sql" - - files, err := ioutil.ReadDir(sqlDir) - if err != nil { - fmt.Println("Error reading sql directory:", err) - return - } - - for _, file := range files { - if strings.HasSuffix(file.Name(), "_repository.go") { - repoName := strings.TrimSuffix(file.Name(), "_repository.go") - filePath := filepath.Join(sqlDir, file.Name()) - - content, err := ioutil.ReadFile(filePath) - if err != nil { - fmt.Printf("Error reading file %s: %v\n", filePath, err) - continue - } - - newContent := strings.Replace(string(content), `"tercul/internal/domain"`, fmt.Sprintf(`"%s"`, filepath.Join("tercul/internal/domain", repoName)), 1) - newContent = strings.Replace(newContent, "domain."+strings.Title(repoName)+"Repository", repoName+"."+strings.Title(repoName)+"Repository", 1) - - if err := ioutil.WriteFile(filePath, []byte(newContent), 0644); err != nil { - fmt.Printf("Error writing file %s: %v\n", filePath, err) - } else { - fmt.Printf("Fixed imports in %s\n", filePath) - } - } - } -} diff --git a/refactor.md b/refactor.md index 227edc5..219be74 100644 --- a/refactor.md +++ b/refactor.md @@ -84,11 +84,11 @@ Short, sharp audit. You’ve got good bones but too many cross-cutting seams: du # 2) Specific refactors (high ROI) -1. **Unify GraphQL** +1. **Unify GraphQL** `[COMPLETED]` -* Delete one of: `/graph` or `/graphql`. Keep **gqlgen** in `internal/adapters/graphql`. -* Put `schema.graphqls` there. Configure `gqlgen.yml` to output generated code in the same package. -* Resolvers should call `internal/app/*` use-cases (not repos), returning **read models** tailored for GraphQL. +* Delete one of: `/graph` or `/graphql`. Keep **gqlgen** in `internal/adapters/graphql`. `[COMPLETED]` +* Put `schema.graphqls` there. Configure `gqlgen.yml` to output generated code in the same package. `[COMPLETED]` +* Resolvers should call `internal/app/*` use-cases (not repos), returning **read models** tailored for GraphQL. `[COMPLETED]` 2. **Introduce Unit-of-Work (UoW) + Transaction boundaries** @@ -114,10 +114,10 @@ Short, sharp audit. You’ve got good bones but too many cross-cutting seams: du * Current `models/*.go` mixes everything. Group by aggregate (`work`, `author`, `user`, …). Co-locate value objects and invariants. Keep **constructors** that validate invariants (no anemic structs). -6. **Migrations** +6. **Migrations** `[COMPLETED]` -* Move raw SQL to `internal/data/migrations` (or `/migrations` at repo root) and adopt a tool (goose, atlas, migrate). Delete `migrations.go` hand-rollers. -* Version generated `tercul_schema.sql` as **snapshots** in `/ops/migration/outputs/` instead of in runtime code. +* Move raw SQL to `internal/data/migrations` (or `/migrations` at repo root) and adopt a tool (goose, atlas, migrate). Delete `migrations.go` hand-rollers. `[COMPLETED]` +* Version generated `tercul_schema.sql` as **snapshots** in `/ops/migration/outputs/` instead of in runtime code. `[COMPLETED]` 7. **Observability** diff --git a/report.md b/report.md deleted file mode 100644 index 19c85a6..0000000 --- a/report.md +++ /dev/null @@ -1,163 +0,0 @@ -# Tercul Go Application Analysis Report - -## Current Status - -### Overview -The Tercul backend is a Go-based application for literary text analysis and management. It uses a combination of technologies: - -1. **PostgreSQL with GORM**: For relational data storage -2. **Weaviate**: For vector search capabilities -3. **GraphQL with gqlgen**: For API layer -4. **Asynq with Redis**: For asynchronous job processing - -### Core Components - -#### 1. Data Models -The application has a comprehensive set of models organized in separate files in the `models` package, including: -- Core literary content: Work, Translation, Author, Book -- User interaction: Comment, Like, Bookmark, Collection, Contribution -- Analytics: WorkStats, TranslationStats, UserStats -- Linguistic analysis: TextMetadata, PoeticAnalysis, ReadabilityScore, LinguisticLayer -- Location: Country, City, Place, Address -- System: Notification, EditorialWorkflow, Copyright, CopyrightClaim - -The models use inheritance patterns with BaseModel and TranslatableModel providing common fields. The models are well-structured with appropriate relationships between entities. - -#### 2. Repositories -The application uses the repository pattern for data access: -- `GenericRepository`: Provides a generic implementation of CRUD operations using Go generics -- `WorkRepository`: CRUD operations for Work model -- Various other repositories for specific entity types - -The repositories provide a clean abstraction over the database operations. - -#### 3. Synchronization Jobs -The application includes a synchronization mechanism between PostgreSQL and Weaviate: -- `SyncJob`: Manages synchronization process -- `SyncAllEntities`: Syncs entities from PostgreSQL to Weaviate -- `SyncAllEdges`: Syncs edges (relationships) between entities - -The synchronization process uses Asynq for background job processing, allowing for scalable asynchronous operations. - -#### 4. Linguistic Analysis -The application includes a linguistic analysis system: -- `Analyzer` interface: Defines methods for text analysis -- `BasicAnalyzer`: Implements simple text analysis algorithms -- `LinguisticSyncJob`: Manages background jobs for linguistic analysis - -The linguistic analysis includes basic text statistics, readability metrics, keyword extraction, and sentiment analysis, though the implementations are simplified. - -#### 5. GraphQL API -The GraphQL API is well-defined with a comprehensive schema that includes types, queries, and mutations for all major entities. The schema supports operations like creating and updating works, translations, and authors, as well as social features like comments, likes, and bookmarks. - -## Areas for Improvement - -### 1. Performance Concerns - -1. **Lack of pagination in repositories**: Many repository methods retrieve all records without pagination, which could cause performance issues with large datasets. For example, the `List()` and `GetAllForSync()` methods in repositories return all records without any limit. - -2. **Raw SQL queries in entity synchronization**: The `syncEntities` function in `syncjob/entities_sync.go` uses raw SQL queries with string concatenation instead of GORM's structured query methods, which could lead to SQL injection vulnerabilities and is less efficient. - -3. **Loading all records at once**: The synchronization process loads all records of each entity type at once, which could cause memory issues with large datasets. There's no batching or pagination for large datasets. - -4. **No batching in Weaviate operations**: The Weaviate client doesn't use batching for operations, which could be inefficient for large datasets. Each entity is sent to Weaviate in a separate API call. - -5. **Inefficient linguistic analysis algorithms**: The linguistic analysis algorithms in `linguistics/analyzer.go` are very simplified and not optimized for performance. For example, the sentiment analysis algorithm checks each word against a small list of positive and negative words, which is inefficient. - -### 2. Security Concerns - -1. **Hardcoded database credentials**: The `main.go` file contains hardcoded database credentials, which is a security risk. These should be moved to environment variables or a secure configuration system. - -2. **SQL injection risk**: The `syncEntities` function in `syncjob/entities_sync.go` uses raw SQL queries with string concatenation, which could lead to SQL injection vulnerabilities. - -3. **No input validation**: There doesn't appear to be comprehensive input validation for GraphQL mutations, which could lead to data integrity issues or security vulnerabilities. - -4. **No rate limiting**: There's no rate limiting for API requests or background jobs, which could make the system vulnerable to denial-of-service attacks. - -### 3. Code Quality Issues - -1. **Incomplete Weaviate integration**: The Weaviate client in `weaviate/weaviate_client.go` only supports the Work model, not other models, which limits the search capabilities. - -2. **Simplified linguistic analysis**: The linguistic analysis algorithms in `linguistics/analyzer.go` are very basic and not suitable for production use. They use simplified approaches that don't leverage modern NLP techniques. - -3. **Hardcoded string mappings**: The `toSnakeCase` function in `syncjob/entities_sync.go` has hardcoded mappings for many entity types, which is not maintainable. - -### 4. Testing and Documentation - -1. **Lack of API documentation**: The GraphQL schema lacks documentation for types, queries, and mutations, which makes it harder for developers to use the API. - -2. **Missing code documentation**: Many functions and packages lack proper documentation, which makes the codebase harder to understand and maintain. - -3. **No performance benchmarks**: There are no performance benchmarks to identify bottlenecks and measure improvements. - -## Recommendations for Future Development - -### 1. Architecture Improvements - -1. **Implement a service layer**: Add a service layer between repositories and resolvers to encapsulate business logic and improve separation of concerns. This would include services for each domain entity (WorkService, UserService, etc.) that handle validation, business rules, and coordination between repositories. - -2. **Improve error handling**: Implement consistent error handling with proper error types and recovery mechanisms. Create custom error types for common scenarios (NotFoundError, ValidationError, etc.) and ensure errors are properly propagated and logged. - -3. **Add configuration management**: Use a proper configuration management system instead of hardcoded values. Implement a configuration struct that can be loaded from environment variables, config files, or other sources, with support for defaults and validation. - -4. **Implement a logging framework**: Use a structured logging framework for better observability. A library like zap or logrus would provide structured logging with different log levels, contextual information, and better performance than the standard log package. - -### 2. Performance Optimizations - -1. **Add pagination to all list operations**: Implement pagination for all repository methods that return lists. This would include adding page and pageSize parameters to List methods, calculating the total count, and returning both the paginated results and the total count. - -2. **Use GORM's structured query methods**: Replace raw SQL queries with GORM's structured query methods. Instead of using raw SQL queries with string concatenation, use GORM's Table(), Find(), Where(), and other methods to build queries in a structured and safe way. - -3. **Implement batching for Weaviate operations**: Use batching for Weaviate operations to reduce the number of API calls. Process entities in batches of a configurable size (e.g., 100) to reduce the number of API calls and improve performance. - -4. **Add caching for frequently accessed data**: Implement Redis caching for frequently accessed data. Use Redis to cache frequently accessed data like works, authors, and other entities, with appropriate TTL values and cache invalidation strategies. - -5. **Optimize linguistic analysis algorithms**: Replace simplified algorithms with more efficient implementations or use external NLP libraries. The current sentiment analysis and keyword extraction algorithms are very basic and inefficient. Use established NLP libraries like spaCy, NLTK, or specialized sentiment analysis libraries. - -6. **Implement database indexing**: Add appropriate indexes to database tables for better query performance. Add indexes to frequently queried fields like title, language, and foreign keys to improve query performance. - -### 3. Code Quality Enhancements - -1. **Add input validation**: Implement input validation for all GraphQL mutations. Validate required fields, field formats, and business rules before processing data to ensure data integrity and security. - -2. **Improve error messages**: Provide more descriptive error messages for better debugging. Include context information in error messages, distinguish between different types of errors (not found, validation, database, etc.), and use error wrapping to preserve the error chain. - -3. **Add code documentation**: Add comprehensive documentation to all packages and functions. Include descriptions of function purpose, parameters, return values, and examples where appropriate. Follow Go's documentation conventions for godoc compatibility. - -4. **Refactor duplicate code**: Identify and refactor duplicate code, especially in the synchronization process. Extract common functionality into reusable functions or methods, and consider using interfaces for common behavior patterns. - -### 4. Testing Improvements - -1. **Add integration tests**: Implement integration tests for the GraphQL API and background jobs. Test the entire request-response cycle for GraphQL queries and mutations, including error handling and validation. For background jobs, test the job enqueuing, processing, and completion. - -2. **Add performance tests**: Implement performance tests to identify bottlenecks. Use Go's built-in benchmarking tools to measure the performance of critical operations like database queries, synchronization processes, and linguistic analysis. Set performance baselines and monitor for regressions. - -### 5. Security Enhancements - -1. **Implement proper authentication**: Add JWT authentication with proper token validation. Implement a middleware that validates JWT tokens in the Authorization header, extracts user information from claims, and adds it to the request context for use in resolvers. - -2. **Add authorization checks**: Implement role-based access control for all operations. Add checks in resolvers to verify that the authenticated user has the appropriate role and permissions to perform the requested operation, especially for mutations that modify data. - -3. **Use environment variables for credentials**: Move hardcoded credentials to environment variables. Replace hardcoded database credentials, API keys, and other sensitive information with values loaded from environment variables or a secure configuration system. - -4. **Implement rate limiting**: Add rate limiting for API requests and background jobs. Use a rate limiting middleware to prevent abuse of the API, with configurable limits based on user role, IP address, or other criteria. Also implement rate limiting for background job processing to prevent resource exhaustion. - -## Conclusion - -The Tercul Go application has a solid foundation with a well-structured domain model, repository pattern, and GraphQL API. The application demonstrates good architectural decisions such as using background job processing for synchronization and having a modular design for linguistic analysis. - -A comprehensive suite of unit tests has been added for all models, repositories, and services, which significantly improves the code quality and will help prevent regressions. The password hashing for users has also been implemented. - -However, there are still several areas that need improvement: - -1. **Performance**: The application has potential performance issues with lack of pagination, inefficient database queries, and simplified algorithms. - -2. **Security**: There are security vulnerabilities such as hardcoded credentials and SQL injection risks in some parts of the application. - -3. **Code Quality**: The codebase has some inconsistencies in repository implementation, limited error handling, and incomplete features. - -4. **Testing**: While unit test coverage is now good, integration and performance tests are still lacking. - -By addressing these issues and implementing the recommended improvements, the Tercul Go application can become more robust, secure, and scalable. The most critical issues to address are implementing proper password hashing, adding pagination to list operations, improving error handling, and enhancing the linguistic analysis capabilities. - -The application has the potential to be a powerful platform for literary text analysis and management, but it requires significant development to reach production readiness. diff --git a/validate.py b/validate.py deleted file mode 100644 index 5bf7759..0000000 --- a/validate.py +++ /dev/null @@ -1,45 +0,0 @@ -import json -import os -from jsonschema import validate -from referencing import Registry, Resource -from referencing.jsonschema import DRAFT202012 - -def main(): - """ - Validates the example blog posts against the blog.json schema. - """ - schemas_dir = "schemas" - content_dir = "content/blog" - - # Create a resource for each schema - blog_schema_path = os.path.join(schemas_dir, "blog.json") - with open(blog_schema_path, "r") as f: - blog_schema_resource = Resource.from_contents(json.load(f), default_specification=DRAFT202012) - - defs_schema_path = os.path.join(schemas_dir, "_defs.json") - with open(defs_schema_path, "r") as f: - defs_schema_resource = Resource.from_contents(json.load(f), default_specification=DRAFT202012) - - # Create a registry and add the resources - registry = Registry().with_resources( - [ - ("blog.json", blog_schema_resource), - ("_defs.json", defs_schema_resource), - ] - ) - - # Validate each blog post - for filename in os.listdir(content_dir): - if filename.endswith(".json"): - filepath = os.path.join(content_dir, filename) - with open(filepath, "r") as f: - instance = json.load(f) - - try: - validate(instance=instance, schema=blog_schema_resource.contents, registry=registry) - print(f"Successfully validated {filename}") - except Exception as e: - print(f"Validation failed for {filename}: {e}") - -if __name__ == "__main__": - main()